-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
fetcher.go
1230 lines (1113 loc) · 42.1 KB
/
fetcher.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
package row
import (
"bytes"
"context"
"fmt"
"strings"
"time"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/sql/catalog"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/catpb"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/colinfo"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb"
"github.com/cockroachdb/cockroach/pkg/sql/rowenc"
"github.com/cockroachdb/cockroach/pkg/sql/rowenc/keyside"
"github.com/cockroachdb/cockroach/pkg/sql/rowenc/valueside"
"github.com/cockroachdb/cockroach/pkg/sql/rowinfra"
"github.com/cockroachdb/cockroach/pkg/sql/scrub"
"github.com/cockroachdb/cockroach/pkg/sql/sem/eval"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/sessiondatapb"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/encoding"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/mon"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
)
const (
	// DebugRowFetch enables some low-level debugging logs when set to true. It
	// is a compile-time constant so that the hot path does not have to call
	// log.V.
	DebugRowFetch = false

	// noOutputColumn is the sentinel ordinal meaning that a system column is
	// not part of the output.
	noOutputColumn = -1
)
// kvBatchFetcherResponse contains a response from the KVBatchFetcher, as
// returned by a single call to nextBatch().
type kvBatchFetcherResponse struct {
	// moreKVs indicates whether this response contains some keys from the
	// fetch.
	//
	// If true, then the fetch might (or might not) be complete at this point -
	// another call to nextBatch() is needed to find out. If false, then the
	// fetch has already been completed and this response doesn't have any
	// fetched data.
	moreKVs bool
	// Only one of kvs and batchResponse will be set. Which one is set depends
	// on the server version. Both must be handled by calling code.
	//
	// kvs, if set, is a slice of roachpb.KeyValue, the deserialized kv pairs
	// that were fetched.
	kvs []roachpb.KeyValue
	// batchResponse, if set, is a packed byte slice containing the keys and
	// values. An empty batchResponse indicates that nothing was fetched for the
	// corresponding ScanRequest, and the caller is expected to skip over the
	// response.
	batchResponse []byte
	// spanID is the ID associated with the span that generated this response.
	// NOTE(review): presumably one of the caller-supplied spanIDs handed to
	// SetupNextFetch - confirm against the KVBatchFetcher implementation.
	spanID int
}
// KVBatchFetcher abstracts the logic of fetching KVs in batches. Because it
// has unexported methods, it can only be implemented inside this package.
type KVBatchFetcher interface {
	// SetupNextFetch prepares the fetch of the next set of spans.
	//
	// spanIDs may be nil (Fetcher.StartInconsistentScan passes nil); see
	// Fetcher.StartScan for the contract between spans and spanIDs.
	// firstBatchKeyLimit is the key limit for the first batch; the Fetcher
	// derives it from its row limit hint and passes zero when there is no hint.
	SetupNextFetch(
		ctx context.Context,
		spans roachpb.Spans,
		spanIDs []int,
		batchBytesLimit rowinfra.BytesLimit,
		firstBatchKeyLimit rowinfra.KeyLimit,
	) error
	// nextBatch returns the next batch of rows. See kvBatchFetcherResponse for
	// details on what is returned.
	nextBatch(ctx context.Context) (kvBatchFetcherResponse, error)
	// close is called when the fetcher is no longer needed.
	// NOTE(review): presumably releases the resources held by the
	// implementation - confirm against the implementations.
	close(ctx context.Context)
}
// tableInfo holds the Fetcher's state for the table and index being fetched:
// information derived once from the IndexFetchSpec in Fetcher.Init, plus
// scratch space that is overwritten as rows are decoded.
type tableInfo struct {
	// -- Fields initialized once --
	// spec is a copy of the IndexFetchSpec that was passed to Fetcher.Init.
	spec descpb.IndexFetchSpec
	// The set of indexes into spec.FetchedColumns that are required for columns
	// in the value part.
	neededValueColsByIdx util.FastIntSet
	// The number of needed columns from the value part of the row. Once we've
	// seen this number of value columns for a particular row, we can stop
	// decoding values in that row.
	neededValueCols int
	// Map used to get the index for columns in spec.FetchedColumns.
	colIdxMap catalog.TableColMap
	// One value per column that is part of the key; each value is a column index
	// (into spec.FetchedColumns); -1 if we don't need the value for that column.
	indexColIdx []int
	// -- Fields updated during a scan --
	// keyVals is the scratch space that DecodeIndexKey decodes the index key
	// columns of the current row into. Init sizes it to the number of key
	// columns, excluding the suffix columns of unique secondary indexes.
	keyVals []rowenc.EncDatum
	// extraVals is only sized (in Init) for unique secondary indexes, with one
	// entry per key suffix column; those columns are decoded from the value.
	extraVals []rowenc.EncDatum
	// row has one entry per column in spec.FetchedColumns and is filled in as
	// the KVs of the current row are processed.
	row rowenc.EncDatumRow
	// decodedRow has one entry per column in spec.FetchedColumns.
	// NOTE(review): presumably holds the fully decoded form of row - confirm
	// against the code that produces rows.
	decodedRow tree.Datums
	// The following fields contain MVCC metadata for each row and may be
	// returned to users of Fetcher immediately after NextRow returns.
	//
	// rowLastModified is the timestamp of the last time any family in the row
	// was modified in any way.
	rowLastModified hlc.Timestamp
	// timestampOutputIdx controls at what row ordinal to write the timestamp.
	// It is noOutputColumn when the MVCC timestamp column is not fetched.
	timestampOutputIdx int
	// Fields for outputting the tableoid system column. oidOutputIdx is
	// noOutputColumn when the tableoid column is not fetched.
	tableOid tree.Datum
	oidOutputIdx int
	// rowIsDeleted is true when the row has been deleted. This is only
	// meaningful when kv deletion tombstones are returned by the KVBatchFetcher,
	// which the one used by `StartScan` (the common case) doesn't. Notably,
	// changefeeds use this by providing raw kvs with tombstones unfiltered via
	// `StartScanFrom`.
	rowIsDeleted bool
}
// Fetcher handles fetching kvs and forming table rows for a single table.
// Usage:
//
//	var rf Fetcher
//	err := rf.Init(..)
//	// Handle err
//	err := rf.StartScan(..)
//	// Handle err
//	for {
//		res, err := rf.NextRow()
//		// Handle err
//		if res.row == nil {
//			// Done
//			break
//		}
//		// Process res.row
//	}
type Fetcher struct {
	// args holds the arguments that were passed to Init.
	args FetcherInitArgs
	// table holds the state for the table and index being fetched. It is the
	// only field that survives Reset.
	table tableInfo
	// True if the index key must be decoded. This is only false if there are no
	// needed columns.
	mustDecodeIndexKey bool
	// mvccDecodeStrategy controls whether or not MVCC timestamps should
	// be decoded from KV's fetched.
	mvccDecodeStrategy MVCCDecodingStrategy
	// -- Fields updated during a scan --
	// kvFetcher is the source of the KVs. It is set up either by Init or by
	// StartScanFrom.
	kvFetcher *KVFetcher
	// indexKey stores the index key of the current row, up to (and not including)
	// any family ID.
	indexKey []byte
	// NOTE(review): prettyValueBuf is presumably a scratch buffer for building
	// the pretty-printed value when tracing - confirm against its users.
	prettyValueBuf *bytes.Buffer
	valueColsFound int // how many needed cols we've found so far in the value
	// The current key/value, unless kvEnd is true.
	kv roachpb.KeyValue
	// keyRemainingBytes is the part of kv.Key that follows the index key of the
	// current row.
	keyRemainingBytes []byte
	// kvEnd is set to true once the scan has no more keys.
	kvEnd bool
	// spanID is associated with the input span that produced the data in kv.
	spanID int
	// IgnoreUnexpectedNulls allows Fetcher to return null values for non-nullable
	// columns and is only used for decoding for error messages or debugging.
	IgnoreUnexpectedNulls bool
	// Memory monitor and memory account for the bytes fetched by this fetcher.
	// Both are only set when Init is given a non-nil MemMonitor.
	mon *mon.BytesMonitor
	kvFetcherMemAcc *mon.BoundAccount
}
// Reset returns this Fetcher to its zero state while keeping rf.table, so
// that a later Init can reuse the capacity of the slices inside the tableInfo
// (indexColIdx, keyVals, extraVals) instead of reallocating them.
//
// Reset does not call Close: every other field, including the KV fetcher and
// the memory monitor, is simply dropped.
func (rf *Fetcher) Reset() {
	preserved := rf.table
	*rf = Fetcher{}
	rf.table = preserved
}
// Close releases the resources held by this fetcher: it closes the KV fetcher
// if there is one and, if Init created a memory monitor, closes the bound
// account and stops the monitor.
func (rf *Fetcher) Close(ctx context.Context) {
	if kvf := rf.kvFetcher; kvf != nil {
		kvf.Close(ctx)
	}
	if rf.mon == nil {
		// No monitor means Init never created the memory account either.
		return
	}
	rf.kvFetcherMemAcc.Close(ctx)
	rf.mon.Stop(ctx)
}
// FetcherInitArgs contains arguments for Fetcher.Init.
type FetcherInitArgs struct {
	// StreamingKVFetcher, if non-nil, contains the KVFetcher that uses the
	// kvstreamer.Streamer API under the hood. The caller is then expected to
	// use only StartScan() method. It must not be combined with
	// WillUseCustomKVFetcher; Init returns an assertion error if both are set.
	StreamingKVFetcher *KVFetcher
	// WillUseCustomKVFetcher, if true, indicates that the caller will only use
	// StartScanFrom() method and will be providing its own KVFetcher.
	WillUseCustomKVFetcher bool
	// Txn is the txn for the fetch. It might be nil, and the caller is expected
	// to either provide the txn later via SetTxn() or to only use StartScanFrom
	// method.
	Txn *kv.Txn
	// Reverse denotes whether or not the spans should be read in reverse or not
	// when StartScan* methods are invoked.
	Reverse bool
	// LockStrength represents the row-level locking mode to use when fetching
	// rows.
	LockStrength descpb.ScanLockingStrength
	// LockWaitPolicy represents the policy to be used for handling conflicting
	// locks held by other active transactions.
	LockWaitPolicy descpb.ScanLockingWaitPolicy
	// LockTimeout specifies the maximum amount of time that the fetcher will
	// wait while attempting to acquire a lock on a key or while blocking on an
	// existing lock in order to perform a non-locking read on a key.
	LockTimeout time.Duration
	// Alloc is used for buffered allocation of decoded datums.
	Alloc *tree.DatumAlloc
	// MemMonitor, if non-nil, is used by Init as the parent of a new
	// "fetcher-mem" monitor whose bound account is handed to the KV fetcher
	// that Init creates.
	MemMonitor *mon.BytesMonitor
	// Spec describes the index to fetch from and the columns to fetch. It must
	// be non-nil; Init dereferences it unconditionally, stores a copy, and
	// rejects any Version other than descpb.IndexFetchSpecVersionInitial.
	Spec *descpb.IndexFetchSpec
	// TraceKV indicates whether or not session tracing is enabled.
	TraceKV bool
	// ForceProductionKVBatchSize is passed through to the KV batch fetcher that
	// Init creates.
	// NOTE(review): presumably makes tests use the production batch size -
	// confirm against kvBatchFetcherArgs.
	ForceProductionKVBatchSize bool
}
// Init sets up a Fetcher for a given table and index.
//
// args.Spec must be non-nil. Init returns an error if the spec version is not
// descpb.IndexFetchSpecVersionInitial, or an assertion error if both
// args.StreamingKVFetcher and args.WillUseCustomKVFetcher are set.
//
// Init rebuilds rf.table from scratch but reuses the capacity of the
// indexColIdx, keyVals and extraVals slices left over from a previous use of
// this Fetcher (see Reset). Unless args.WillUseCustomKVFetcher is set, it also
// installs the KV fetcher: args.StreamingKVFetcher if provided, otherwise a
// newly created one.
func (rf *Fetcher) Init(ctx context.Context, args FetcherInitArgs) error {
	if args.Spec.Version != descpb.IndexFetchSpecVersionInitial {
		return errors.Newf("unsupported IndexFetchSpec version %d", args.Spec.Version)
	}
	rf.args = args
	// Set up a child memory monitor and its bound account only when the caller
	// provided a parent monitor.
	if args.MemMonitor != nil {
		rf.mon = mon.NewMonitorInheritWithLimit("fetcher-mem", 0 /* limit */, args.MemMonitor)
		rf.mon.Start(ctx, args.MemMonitor, mon.BoundAccount{})
		memAcc := rf.mon.MakeBoundAccount()
		rf.kvFetcherMemAcc = &memAcc
	}
	table := &rf.table
	*table = tableInfo{
		spec: *args.Spec,
		row: make(rowenc.EncDatumRow, len(args.Spec.FetchedColumns)),
		decodedRow: make(tree.Datums, len(args.Spec.FetchedColumns)),
		// These slice fields might get re-allocated below, so reslice them from
		// the old table here in case they've got enough capacity already.
		indexColIdx: rf.table.indexColIdx[:0],
		keyVals: rf.table.keyVals[:0],
		extraVals: rf.table.extraVals[:0],
		timestampOutputIdx: noOutputColumn,
		oidOutputIdx: noOutputColumn,
	}
	// Map every fetched column ID to its ordinal and note where the system
	// columns (MVCC timestamp, tableoid) go in the output, if requested.
	for idx := range args.Spec.FetchedColumns {
		colID := args.Spec.FetchedColumns[idx].ColumnID
		table.colIdxMap.Set(colID, idx)
		if colinfo.IsColIDSystemColumn(colID) {
			switch colinfo.GetSystemColumnKindFromColumnID(colID) {
			case catpb.SystemColumnKind_MVCCTIMESTAMP:
				table.timestampOutputIdx = idx
				// The timestamp column can only be produced if the MVCC
				// timestamps of the KVs are decoded.
				rf.mvccDecodeStrategy = MVCCDecodingRequired
			case catpb.SystemColumnKind_TABLEOID:
				table.oidOutputIdx = idx
				table.tableOid = tree.NewDOid(tree.DInt(args.Spec.TableID))
			}
		}
	}
	// Start by assuming every fetched column comes from the value; the columns
	// found in the index key are removed from the set below.
	if len(args.Spec.FetchedColumns) > 0 {
		table.neededValueColsByIdx.AddRange(0, len(args.Spec.FetchedColumns)-1)
	}
	nExtraCols := 0
	// Unique secondary indexes have extra columns to decode from the value (namely
	// the primary index columns).
	if table.spec.IsSecondaryIndex && table.spec.IsUniqueIndex {
		nExtraCols = int(table.spec.NumKeySuffixColumns)
	}
	nIndexCols := len(args.Spec.KeyAndSuffixColumns) - nExtraCols
	neededIndexCols := 0
	compositeIndexCols := 0
	// Reuse the old indexColIdx backing array when it is large enough.
	if cap(table.indexColIdx) >= nIndexCols {
		table.indexColIdx = table.indexColIdx[:nIndexCols]
	} else {
		table.indexColIdx = make([]int, nIndexCols)
	}
	// For each index key column, record which fetched column (if any) it
	// provides, and count the needed and the composite key columns.
	for i := 0; i < nIndexCols; i++ {
		id := args.Spec.KeyAndSuffixColumns[i].ColumnID
		colIdx, ok := table.colIdxMap.Get(id)
		if ok {
			table.indexColIdx[i] = colIdx
			neededIndexCols++
			table.neededValueColsByIdx.Remove(colIdx)
		} else {
			table.indexColIdx[i] = -1
		}
		if args.Spec.KeyAndSuffixColumns[i].IsComposite {
			compositeIndexCols++
		}
	}
	// If there are needed columns from the index key, we need to read it;
	// otherwise, we can completely avoid decoding the index key.
	rf.mustDecodeIndexKey = neededIndexCols > 0
	// The number of columns we need to read from the value part of the key.
	// It's the total number of needed columns minus the ones we read from the
	// index key, except for composite columns.
	table.neededValueCols = len(args.Spec.FetchedColumns) - neededIndexCols + compositeIndexCols
	// Reuse the old keyVals backing array when it is large enough.
	if cap(table.keyVals) >= nIndexCols {
		table.keyVals = table.keyVals[:nIndexCols]
	} else {
		table.keyVals = make([]rowenc.EncDatum, nIndexCols)
	}
	if nExtraCols > 0 {
		// Unique secondary indexes have a value that is the
		// primary index key.
		// Primary indexes only contain ascendingly-encoded
		// values. If this ever changes, we'll probably have to
		// figure out the directions here too.
		if cap(table.extraVals) >= nExtraCols {
			table.extraVals = table.extraVals[:nExtraCols]
		} else {
			table.extraVals = make([]rowenc.EncDatum, nExtraCols)
		}
	}
	// Pick the KV fetcher: the caller's streaming fetcher, a newly created
	// one, or none at all when the caller will supply one via StartScanFrom.
	if args.StreamingKVFetcher != nil {
		if args.WillUseCustomKVFetcher {
			return errors.AssertionFailedf(
				"StreamingKVFetcher is non-nil when WillUseCustomKVFetcher is true",
			)
		}
		rf.kvFetcher = args.StreamingKVFetcher
	} else if !args.WillUseCustomKVFetcher {
		fetcherArgs := kvBatchFetcherArgs{
			reverse: args.Reverse,
			lockStrength: args.LockStrength,
			lockWaitPolicy: args.LockWaitPolicy,
			lockTimeout: args.LockTimeout,
			acc: rf.kvFetcherMemAcc,
			forceProductionKVBatchSize: args.ForceProductionKVBatchSize,
		}
		// The txn-dependent fields are left unset when no txn is available
		// yet; they are filled in later by SetTxn.
		if args.Txn != nil {
			fetcherArgs.sendFn = makeKVBatchFetcherDefaultSendFunc(args.Txn)
			fetcherArgs.requestAdmissionHeader = args.Txn.AdmissionHeader()
			fetcherArgs.responseAdmissionQ = args.Txn.DB().SQLKVResponseAdmissionQ
		}
		rf.kvFetcher = newKVFetcher(newKVBatchFetcher(fetcherArgs))
	}
	return nil
}
// SetTxn makes the Fetcher use the provided txn, together with the default
// send function built for that txn.
//
// It covers two use cases:
//   - supplying the txn after the fact, when none was available at the time of
//     Fetcher.Init;
//   - switching the Fetcher over to a new txn. The caller must be careful in
//     this case, since reads performed under different txns do not provide a
//     consistent view of the data.
func (rf *Fetcher) SetTxn(txn *kv.Txn) {
	defaultSendFn := makeKVBatchFetcherDefaultSendFunc(txn)
	rf.setTxnAndSendFn(txn, defaultSendFn)
}
// setTxnAndSendFn reaches into the KVFetcher and updates the underlying
// txnKVFetcher with the given sendFn and with the admission control settings
// taken from txn.
//
// The type assertion is unchecked, so this panics if the KVFetcher wraps any
// KVBatchFetcher other than a *txnKVFetcher.
func (rf *Fetcher) setTxnAndSendFn(txn *kv.Txn, sendFn sendFunc) {
	batchFetcher := rf.kvFetcher.KVBatchFetcher.(*txnKVFetcher)
	batchFetcher.sendFn = sendFn
	admissionHeader := txn.AdmissionHeader()
	batchFetcher.requestAdmissionHeader = admissionHeader
	db := txn.DB()
	batchFetcher.responseAdmissionQ = db.SQLKVResponseAdmissionQ
}
// StartScan initializes and starts the key-value scan over the given spans. It
// can be called multiple times, but not if WillUseCustomKVFetcher was set to
// true in Init(). An assertion error is returned in that case, and also when
// spans is empty.
//
// Ownership of spans: the fetcher takes ownership of the spans slice. It may
// modify the slice and performs the corresponding memory accounting (if Init()
// was called with a non-nil memory monitor). The caller may only reuse the
// slice after the fetcher has been closed, and then becomes responsible for
// its memory accounting.
//
// Ownership of spanIDs: the fetcher takes ownership of the spanIDs slice too
// and may modify it, but it does **not** account for its memory. Tracking the
// memory under spanIDs is the caller's responsibility, and the slice can only
// be reused once the fetcher has been closed. Notably, the fetcher never
// increases the capacity of the slice.
//
// spanIDs holds user-defined identifiers of the spans. When non-nil, it must
// have the same length as spans, and some methods of the Fetcher then return
// each fetched row along with the ID of the original span that the row belongs
// to. Pass nil if that information is not needed.
//
// batchBytesLimit controls whether bytes limits are placed on the batches.
// When set, bytes limits protect against running out of memory (both on this
// client node and on the server).
//
// When batchBytesLimit is set, rowLimitHint can additionally be set to control
// how many rows the first batch scans. In that case subsequent batches (if
// any) get progressively higher limits (up to a fixed max). Row limits exist
// to make LIMIT queries efficient: if the caller has some idea of how many
// rows must be read to ultimately satisfy the query, the Fetcher makes use of
// it. Even when the hint proves insufficient, the Fetcher keeps setting row
// limits (in addition to bytes limits), on the argument that some number of
// rows will eventually satisfy the query and `spans` likely does not need to
// be scanned fully. The bytes limit, by contrast, is purely a protection
// against OOMs.
//
// Batch limits can only be used if the spans are ordered.
func (rf *Fetcher) StartScan(
	ctx context.Context,
	spans roachpb.Spans,
	spanIDs []int,
	batchBytesLimit rowinfra.BytesLimit,
	rowLimitHint rowinfra.RowLimit,
) error {
	switch {
	case rf.args.WillUseCustomKVFetcher:
		return errors.AssertionFailedf("StartScan is called instead of StartScanFrom")
	case len(spans) == 0:
		return errors.AssertionFailedf("no spans")
	}

	// The hint is expressed in rows, whereas the KV fetcher limits keys.
	firstBatchKeyLimit := rf.rowLimitToKeyLimit(rowLimitHint)
	err := rf.kvFetcher.SetupNextFetch(ctx, spans, spanIDs, batchBytesLimit, firstBatchKeyLimit)
	if err != nil {
		return err
	}
	return rf.startScan(ctx)
}
// TestingInconsistentScanSleep introduces a sleep inside the fetcher after
// every KV batch (for inconsistent scans, currently used only for table
// statistics collection). The zero value disables the sleep.
// TODO(radu): consolidate with forceProductionKVBatchSize into a
// FetcherTestingKnobs struct.
var TestingInconsistentScanSleep time.Duration
// StartInconsistentScan initializes and starts an inconsistent scan, where each
// KV batch can be read at a different historical timestamp.
//
// The scan uses the initial timestamp, until it becomes older than
// maxTimestampAge; at this time the timestamp is bumped by the amount of time
// that has passed. See the documentation for TableReaderSpec for more
// details.
//
// Can be used multiple times. Cannot be used if a StreamingKVFetcher was
// provided or if WillUseCustomKVFetcher was set to true in Init(); an
// assertion error is returned in both cases, as well as when spans is empty.
// A regular error is returned if initialTimestamp is already at least
// maxTimestampAge old when the scan is started.
//
// The scan does not report span IDs: nil spanIDs are passed to the KV fetcher.
//
// Batch limits can only be used if the spans are ordered.
func (rf *Fetcher) StartInconsistentScan(
	ctx context.Context,
	db *kv.DB,
	initialTimestamp hlc.Timestamp,
	maxTimestampAge time.Duration,
	spans roachpb.Spans,
	batchBytesLimit rowinfra.BytesLimit,
	rowLimitHint rowinfra.RowLimit,
	qualityOfService sessiondatapb.QoSLevel,
) error {
	if rf.args.StreamingKVFetcher != nil {
		// A streaming KVFetcher only supports StartScan (see
		// FetcherInitArgs.StreamingKVFetcher). StartScanFrom would be rejected
		// too, since Init refuses to combine a streaming fetcher with
		// WillUseCustomKVFetcher, so point the caller at StartScan.
		return errors.AssertionFailedf("StartInconsistentScan is called instead of StartScan")
	}
	if rf.args.WillUseCustomKVFetcher {
		return errors.AssertionFailedf("StartInconsistentScan is called instead of StartScanFrom")
	}
	if len(spans) == 0 {
		return errors.AssertionFailedf("no spans")
	}

	// txnTimestamp and txnStartTime are captured and advanced by sendFn below
	// every time the transaction is bumped.
	txnTimestamp := initialTimestamp
	txnStartTime := timeutil.Now()
	if txnStartTime.Sub(txnTimestamp.GoTime()) >= maxTimestampAge {
		return errors.Errorf(
			"AS OF SYSTEM TIME: cannot specify timestamp older than %s for this operation",
			maxTimestampAge,
		)
	}
	txn := kv.NewTxnWithSteppingEnabled(ctx, db, 0 /* gatewayNodeID */, qualityOfService)
	if err := txn.SetFixedTimestamp(ctx, txnTimestamp); err != nil {
		return err
	}
	if log.V(1) {
		log.Infof(ctx, "starting inconsistent scan at timestamp %v", txnTimestamp)
	}

	// sendFn sends each batch using the current txn, replacing that txn with
	// one at a newer fixed timestamp once the current timestamp is too old.
	sendFn := func(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, error) {
		if now := timeutil.Now(); now.Sub(txnTimestamp.GoTime()) >= maxTimestampAge {
			// Time to bump the transaction. First commit the old one (should be a no-op).
			if err := txn.Commit(ctx); err != nil {
				return nil, err
			}
			// Advance the timestamp by the time that passed.
			txnTimestamp = txnTimestamp.Add(now.Sub(txnStartTime).Nanoseconds(), 0 /* logical */)
			txnStartTime = now
			txn = kv.NewTxnWithSteppingEnabled(ctx, db, 0 /* gatewayNodeID */, qualityOfService)
			if err := txn.SetFixedTimestamp(ctx, txnTimestamp); err != nil {
				return nil, err
			}
			if log.V(1) {
				log.Infof(ctx, "bumped inconsistent scan timestamp to %v", txnTimestamp)
			}
		}
		res, err := txn.Send(ctx, ba)
		if err != nil {
			return nil, err.GoError()
		}
		if TestingInconsistentScanSleep != 0 {
			time.Sleep(TestingInconsistentScanSleep)
		}
		return res, nil
	}

	// TODO(radu): we should commit the last txn. Right now the commit is a no-op
	// on read transactions, but perhaps one day it will release some resources.
	rf.setTxnAndSendFn(txn, sendFn)
	if err := rf.kvFetcher.SetupNextFetch(
		ctx, spans, nil /* spanIDs */, batchBytesLimit, rf.rowLimitToKeyLimit(rowLimitHint),
	); err != nil {
		return err
	}
	return rf.startScan(ctx)
}
// rowLimitToKeyLimit converts a row limit hint into the key limit to use for
// the first batch. A zero hint means no limit and yields a zero key limit.
func (rf *Fetcher) rowLimitToKeyLimit(rowLimitHint rowinfra.RowLimit) rowinfra.KeyLimit {
	var keyLimit rowinfra.KeyLimit
	if rowLimitHint != 0 {
		// With a limit hint, the size of the first batch is limited. Later
		// batches get larger to avoid making things too slow (e.g. in case we
		// have a very restrictive filter and actually have to retrieve a lot
		// of rows).
		//
		// The hint counts rows, but a row can be made up of more than one key,
		// so it is scaled by the maximum possible number of keys per row out
		// of all the table rows we could potentially scan over.
		rows := int64(rowLimitHint)
		keysPerRow := int64(rf.table.spec.MaxKeysPerRow)
		// One extra key makes sure we form the last row.
		keyLimit = rowinfra.KeyLimit(rows*keysPerRow + 1)
	}
	return keyLimit
}
// StartScanFrom initializes and starts a scan that reads from the given
// KVBatchFetcher. It can be called multiple times; any KV fetcher left over
// from a previous call is closed before it is replaced. It cannot be used if
// WillUseCustomKVFetcher was set to false in Init(), in which case an
// assertion error is returned.
func (rf *Fetcher) StartScanFrom(ctx context.Context, f KVBatchFetcher) error {
	if !rf.args.WillUseCustomKVFetcher {
		return errors.AssertionFailedf("StartScanFrom is called instead of StartScan")
	}
	if previous := rf.kvFetcher; previous != nil {
		previous.Close(ctx)
	}
	rf.kvFetcher = newKVFetcher(f)
	return rf.startScan(ctx)
}
// startScan resets the per-scan state of the Fetcher and positions it on the
// first key of the scan, recording the span ID that came with that key.
func (rf *Fetcher) startScan(ctx context.Context) error {
	rf.kvEnd = false
	rf.indexKey = nil

	// Retrieve the first key. The span ID is stored even when nextKey fails.
	_, firstSpanID, err := rf.nextKey(ctx)
	rf.spanID = firstSpanID
	return err
}
// setNextKV sets the next KV to process to the input KV. needsCopy, if true,
// causes the input kv to be deep copied. needsCopy should be set to true if
// the input KV is pointing to the last KV of a batch, so that the batch can
// be garbage collected before fetching the next one.
//
// The deep copy covers the key, the value's RawBytes and the value's
// Timestamp; no other field of the input is carried over.
// gcassert:inline
func (rf *Fetcher) setNextKV(kv roachpb.KeyValue, needsCopy bool) {
	if !needsCopy {
		// Keep referencing the memory of the batch.
		rf.kv = kv
		return
	}
	// If we've made it to the very last key in the batch, copy out the key
	// so that the GC can reclaim the large backing slice before we call
	// NextKV() again.
	kvCopy := roachpb.KeyValue{}
	kvCopy.Key = make(roachpb.Key, len(kv.Key))
	copy(kvCopy.Key, kv.Key)
	kvCopy.Value.RawBytes = make([]byte, len(kv.Value.RawBytes))
	copy(kvCopy.Value.RawBytes, kv.Value.RawBytes)
	kvCopy.Value.Timestamp = kv.Value.Timestamp
	rf.kv = kvCopy
}
// nextKey retrieves the next key/value and sets kv/kvEnd. Returns whether the
// key indicates a new row (as opposed to another family for the current row).
//
// newRow is also true when the scan has no more keys. spanID is the span ID
// reported by the KV fetcher for the retrieved key; it is 0 when the scan is
// exhausted or when an error is returned.
//
// When the key starts a new row, rf.indexKey is reset to nil and
// rf.keyRemainingBytes is set to the part of the key that follows the row's
// index key.
func (rf *Fetcher) nextKey(ctx context.Context) (newRow bool, spanID int, _ error) {
	ok, kv, spanID, finalReferenceToBatch, err := rf.kvFetcher.NextKV(ctx, rf.mvccDecodeStrategy)
	if err != nil {
		return false, 0, ConvertFetchError(&rf.table.spec, err)
	}
	// Note that rf.kv is updated even when there are no more keys.
	rf.setNextKV(kv, finalReferenceToBatch)
	if !ok {
		// No more keys in the scan.
		rf.kvEnd = true
		return true, 0, nil
	}
	// unchangedPrefix will be set to true if the current KV belongs to the same
	// row as the previous KV (i.e. the last and current keys have identical
	// prefix). In this case, we can skip decoding the index key completely.
	unchangedPrefix := rf.table.spec.MaxKeysPerRow > 1 && rf.indexKey != nil && bytes.HasPrefix(rf.kv.Key, rf.indexKey)
	if unchangedPrefix {
		// Skip decoding!
		rf.keyRemainingBytes = rf.kv.Key[len(rf.indexKey):]
		return false, spanID, nil
	}
	// The current key belongs to a new row.
	if rf.mustDecodeIndexKey {
		var foundNull bool
		rf.keyRemainingBytes, foundNull, err = rf.DecodeIndexKey(rf.kv.Key)
		if err != nil {
			return false, 0, err
		}
		// For unique secondary indexes, the index-key does not distinguish one row
		// from the next if both rows contain identical values along with a NULL.
		// Consider the keys:
		//
		// /test/unique_idx/NULL/0
		// /test/unique_idx/NULL/1
		//
		// The index-key extracted from the above keys is /test/unique_idx/NULL. The
		// trailing /0 and /1 are the primary key used to unique-ify the keys when a
		// NULL is present. When a null is present in the index key, we cut off more
		// of the index key so that the prefix includes the primary key columns.
		//
		// Note that we do not need to do this for non-unique secondary indexes because
		// the extra columns in the primary key will _always_ be there, so we can decode
		// them when processing the index. The difference with unique secondary indexes
		// is that the extra columns are not always there, and are used to unique-ify
		// the index key, rather than provide the primary key column values.
		if foundNull && rf.table.spec.IsSecondaryIndex && rf.table.spec.IsUniqueIndex && rf.table.spec.MaxKeysPerRow > 1 {
			for i := 0; i < int(rf.table.spec.NumKeySuffixColumns); i++ {
				var err error
				// Slice off an extra encoded column from rf.keyRemainingBytes.
				rf.keyRemainingBytes, err = keyside.Skip(rf.keyRemainingBytes)
				if err != nil {
					return false, 0, err
				}
			}
		}
	} else {
		// We still need to consume the key until the family
		// id, so processKV can know whether we've finished a
		// row or not.
		prefixLen, err := keys.GetRowPrefixLength(rf.kv.Key)
		if err != nil {
			return false, 0, err
		}
		rf.keyRemainingBytes = rf.kv.Key[prefixLen:]
	}
	// A nil indexKey tells processKV that the next KV is the first of its row.
	rf.indexKey = nil
	return true, spanID, nil
}
// prettyKeyDatums renders the given key values as a "/"-separated string, with
// each value preceded by a slash. vals[i] is decoded using the type of
// cols[i]; a value that fails to decode is rendered as "?".
func (rf *Fetcher) prettyKeyDatums(
	cols []descpb.IndexFetchSpec_KeyColumn, vals []rowenc.EncDatum,
) string {
	var sb strings.Builder
	for i := range vals {
		// Operate on a copy of the element, exactly as a by-value range would.
		datum := vals[i]
		sb.WriteByte('/')
		if decodeErr := datum.EnsureDecoded(cols[i].Type, rf.args.Alloc); decodeErr != nil {
			sb.WriteByte('?')
			continue
		}
		sb.WriteString(datum.Datum.String())
	}
	return sb.String()
}
// DecodeIndexKey decodes the index key columns of key into the Fetcher's key
// value scratch space, after skipping the first KeyPrefixLength bytes of the
// key. It returns the part of the key left over after decoding and whether a
// null was encountered while decoding.
func (rf *Fetcher) DecodeIndexKey(key roachpb.Key) (remaining []byte, foundNull bool, err error) {
	spec := &rf.table.spec
	encodedCols := key[spec.KeyPrefixLength:]
	remaining, foundNull, err = rowenc.DecodeKeyValsUsingSpec(
		spec.KeyAndSuffixColumns, encodedCols, rf.table.keyVals,
	)
	return remaining, foundNull, err
}
// processKV processes the given key/value, setting values in the row
// accordingly. If debugStrings is true, returns pretty printed key and value
// information in prettyKey/prettyValue (otherwise they are empty strings).
func (rf *Fetcher) processKV(
ctx context.Context, kv roachpb.KeyValue,
) (prettyKey string, prettyValue string, err error) {
table := &rf.table
if rf.args.TraceKV {
prettyKey = fmt.Sprintf(
"/%s/%s%s",
table.spec.TableName,
table.spec.IndexName,
rf.prettyKeyDatums(table.spec.KeyAndSuffixColumns, table.keyVals),
)
}
// Either this is the first key of the fetch or the first key of a new
// row.
if rf.indexKey == nil {
// This is the first key for the row.
rf.indexKey = []byte(kv.Key[:len(kv.Key)-len(rf.keyRemainingBytes)])
// Reset the row to nil; it will get filled in with the column
// values as we decode the key-value pairs for the row.
// We only need to reset the needed columns in the value component, because
// non-needed columns are never set and key columns are unconditionally set
// below.
for idx, ok := table.neededValueColsByIdx.Next(0); ok; idx, ok = table.neededValueColsByIdx.Next(idx + 1) {
table.row[idx].UnsetDatum()
}
// Fill in the column values that are part of the index key.
for i := range table.keyVals {
if idx := table.indexColIdx[i]; idx != -1 {
table.row[idx] = table.keyVals[i]
}
}
rf.valueColsFound = 0
// Reset the MVCC metadata for the next row.
// set rowLastModified to a sentinel that's before any real timestamp.
// As kvs are iterated for this row, it keeps track of the greatest
// timestamp seen.
table.rowLastModified = hlc.Timestamp{}
// All row encodings (both before and after column families) have a
// sentinel kv (column family 0) that is always present when a row is
// present, even if that row is all NULLs. Thus, a row is deleted if and
// only if the first kv in it a tombstone (RawBytes is empty).
table.rowIsDeleted = len(kv.Value.RawBytes) == 0
}
if table.rowLastModified.Less(kv.Value.Timestamp) {
table.rowLastModified = kv.Value.Timestamp
}
if len(table.spec.FetchedColumns) == 0 {
// We don't need to decode any values.
if rf.args.TraceKV {
prettyValue = "<undecoded>"
}
return prettyKey, prettyValue, nil
}
// For covering secondary indexes, allow for decoding as a primary key.
if table.spec.EncodingType == descpb.PrimaryIndexEncoding &&
len(rf.keyRemainingBytes) > 0 {
// If familyID is 0, kv.Value contains values for composite key columns.
// These columns already have a table.row value assigned above, but that value
// (obtained from the key encoding) might not be correct (e.g. for decimals,
// it might not contain the right number of trailing 0s; for collated
// strings, it is one of potentially many strings with the same collation
// key).
//
// In these cases, the correct value will be present in family 0 and the
// table.row value gets overwritten.
switch kv.Value.GetTag() {
case roachpb.ValueType_TUPLE:
// In this case, we don't need to decode the column family ID, because
// the ValueType_TUPLE encoding includes the column id with every encoded
// column value.
var tupleBytes []byte
tupleBytes, err = kv.Value.GetTuple()
if err != nil {
break
}
prettyKey, prettyValue, err = rf.processValueBytes(ctx, table, kv, tupleBytes, prettyKey)
default:
var familyID uint64
_, familyID, err = encoding.DecodeUvarintAscending(rf.keyRemainingBytes)
if err != nil {
return "", "", scrub.WrapError(scrub.IndexKeyDecodingError, err)
}
// If familyID is 0, this is the row sentinel (in the legacy pre-family format),
// and a value is not expected, so we're done.
if familyID != 0 {
// Find the default column ID for the family.
var defaultColumnID descpb.ColumnID
for _, f := range table.spec.FamilyDefaultColumns {
if f.FamilyID == descpb.FamilyID(familyID) {
defaultColumnID = f.DefaultColumnID
break
}
}
if defaultColumnID == 0 {
return "", "", errors.Errorf("single entry value with no default column id")
}
prettyKey, prettyValue, err = rf.processValueSingle(ctx, table, defaultColumnID, kv, prettyKey)
}
}
if err != nil {
return "", "", scrub.WrapError(scrub.IndexValueDecodingError, err)
}
} else {
tag := kv.Value.GetTag()
var valueBytes []byte
switch tag {
case roachpb.ValueType_BYTES:
// If we have the ValueType_BYTES on a secondary index, then we know we
// are looking at column family 0. Column family 0 stores the extra primary
// key columns if they are present, so we decode them here.
valueBytes, err = kv.Value.GetBytes()
if err != nil {
return "", "", scrub.WrapError(scrub.IndexValueDecodingError, err)
}
if len(table.extraVals) > 0 {
extraCols := table.spec.KeySuffixColumns()
// This is a unique secondary index; decode the extra
// column values from the value.
var err error
valueBytes, _, err = rowenc.DecodeKeyValsUsingSpec(
extraCols,
valueBytes,
table.extraVals,
)
if err != nil {
return "", "", scrub.WrapError(scrub.SecondaryIndexKeyExtraValueDecodingError, err)
}
for i := range extraCols {
if idx, ok := table.colIdxMap.Get(extraCols[i].ColumnID); ok {
table.row[idx] = table.extraVals[i]
}
}
if rf.args.TraceKV {
prettyValue = rf.prettyKeyDatums(extraCols, table.extraVals)
}
}
case roachpb.ValueType_TUPLE:
valueBytes, err = kv.Value.GetTuple()
if err != nil {
return "", "", scrub.WrapError(scrub.IndexValueDecodingError, err)
}
}
if len(valueBytes) > 0 {
prettyKey, prettyValue, err = rf.processValueBytes(
ctx, table, kv, valueBytes, prettyKey,
)
if err != nil {
return "", "", scrub.WrapError(scrub.IndexValueDecodingError, err)
}
}
}
if rf.args.TraceKV && prettyValue == "" {
prettyValue = "<undecoded>"
}
return prettyKey, prettyValue, nil
}
// processValueSingle handles a kv whose value holds a single column, colID,
// and stores the decoded datum into the matching slot of table.row. The kv's
// key is used for logging only.
//
// The returned prettyKey is prettyKeyPrefix, extended with the column name
// when KV tracing is enabled and the column is one of the fetched columns.
// The returned prettyValue is populated only when KV tracing is enabled and a
// value was actually decoded. On a decoding error both strings are empty.
func (rf *Fetcher) processValueSingle(
	ctx context.Context,
	table *tableInfo,
	colID descpb.ColumnID,
	kv roachpb.KeyValue,
	prettyKeyPrefix string,
) (prettyKey string, prettyValue string, err error) {
	colIdx, found := table.colIdxMap.Get(colID)
	if !found {
		// Nothing to unmarshal: the column either came from the index key
		// already or is not needed at all.
		if DebugRowFetch {
			log.Infof(ctx, "Scan %s -> [%d] (skipped)", kv.Key, colID)
		}
		return prettyKeyPrefix, "", nil
	}

	tracing := rf.args.TraceKV
	prettyKey = prettyKeyPrefix
	if tracing {
		prettyKey = fmt.Sprintf("%s/%s", prettyKeyPrefix, table.spec.FetchedColumns[colIdx].Name)
	}

	// An empty value has nothing to decode; the row slot is left as is.
	if len(kv.Value.RawBytes) == 0 {
		return prettyKey, "", nil
	}

	colType := table.spec.FetchedColumns[colIdx].Type
	// TODO(arjun): The value is a directly marshaled single value, hence the
	// eager unmarshal. Avoiding it would mean making UnmarshalColumnValue
	// work on bytes and Encode/DecodeTableValue work on marshaled single
	// values.
	datum, decodeErr := valueside.UnmarshalLegacy(rf.args.Alloc, colType, kv.Value)
	if decodeErr != nil {
		return "", "", decodeErr
	}
	if tracing {
		prettyValue = datum.String()
	}
	table.row[colIdx] = rowenc.DatumToEncDatum(colType, datum)
	if DebugRowFetch {
		log.Infof(ctx, "Scan %s -> %v", kv.Key, datum)
	}
	return prettyKey, prettyValue, nil
}
func (rf *Fetcher) processValueBytes(
ctx context.Context,
table *tableInfo,
kv roachpb.KeyValue,
valueBytes []byte,
prettyKeyPrefix string,
) (prettyKey string, prettyValue string, err error) {
prettyKey = prettyKeyPrefix
if rf.args.TraceKV {
if rf.prettyValueBuf == nil {
rf.prettyValueBuf = &bytes.Buffer{}
}
rf.prettyValueBuf.Reset()
}
var colIDDiff uint32
var lastColID descpb.ColumnID
var typeOffset, dataOffset int
var typ encoding.Type
for len(valueBytes) > 0 && rf.valueColsFound < table.neededValueCols {
typeOffset, dataOffset, colIDDiff, typ, err = encoding.DecodeValueTag(valueBytes)
if err != nil {
return "", "", err
}
colID := lastColID + descpb.ColumnID(colIDDiff)
lastColID = colID
idx, ok := table.colIdxMap.Get(colID)
if !ok {
// This column wasn't requested, so read its length and skip it.
numBytes, err := encoding.PeekValueLengthWithOffsetsAndType(valueBytes, dataOffset, typ)
if err != nil {
return "", "", err
}
valueBytes = valueBytes[numBytes:]
if DebugRowFetch {
log.Infof(ctx, "Scan %s -> [%d] (skipped)", kv.Key, colID)
}
continue
}
if rf.args.TraceKV {
prettyKey = fmt.Sprintf("%s/%s", prettyKey, table.spec.FetchedColumns[idx].Name)
}
var encValue rowenc.EncDatum
encValue, valueBytes, err = rowenc.EncDatumValueFromBufferWithOffsetsAndType(valueBytes, typeOffset,
dataOffset, typ)
if err != nil {
return "", "", err
}
if rf.args.TraceKV {
err := encValue.EnsureDecoded(table.spec.FetchedColumns[idx].Type, rf.args.Alloc)
if err != nil {