-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
streamer.go
1560 lines (1472 loc) · 59.3 KB
/
streamer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
package kvstreamer
import (
"context"
"fmt"
"math"
"runtime"
"sort"
"sync"
"unsafe"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/util/admission"
"github.com/cockroachdb/cockroach/pkg/util/admission/admissionpb"
"github.com/cockroachdb/cockroach/pkg/util/buildutil"
"github.com/cockroachdb/cockroach/pkg/util/mon"
"github.com/cockroachdb/cockroach/pkg/util/quotapool"
"github.com/cockroachdb/cockroach/pkg/util/stop"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/errors"
)
// debug gates the diagnostic fmt.Printf / fmt.Println calls sprinkled through
// the Streamer (e.g. when requests are enqueued, when results are returned to
// the client, and when the coordinator waits for budget). It is a constant
// set to false, so none of that printing executes unless the value is changed
// in the source.
// TODO(yuzefovich): remove this once the Streamer is stabilized.
const debug = false
// OperationMode selects the order in which the Streamer hands results back to
// its caller. The zero value is deliberately not a valid mode.
type OperationMode int

const (
	// InOrder delivers results in exactly the order in which the requests
	// were handed off to the Streamer. Because the requests are executed in
	// parallel internally and can complete in an arbitrary order, this mode
	// forces the Streamer to buffer the results it produces. Buffered results
	// might end up being dropped (resulting in wasted/duplicate work) to make
	// space for the results at the front of the line. This would occur when
	// the budget limitBytes is reached and the size estimates that lead to
	// too much concurrency in the execution were wrong.
	InOrder OperationMode = iota + 1
	// OutOfOrder delivers results in the order in which they are produced.
	// The caller will use the keys field of each Result to associate it with
	// the corresponding requests. This mode of operation lets the Streamer
	// reuse the memory budget as quickly as possible.
	OutOfOrder
)
// Result describes the result of performing a single KV request.
//
// The recipient of the Result is required to call Release() when the Result is
// not in use any more so that its memory is returned to the Streamer's budget.
type Result struct {
// GetResp and ScanResp represent the response to a request. Only one of the
// two will be populated.
//
// The responses are to be considered immutable; the Streamer might hold on
// to the respective memory. Calling Result.Release() tells the Streamer
// that the response is no longer needed.
//
// GetResp is guaranteed to have nil IntentValue.
GetResp *roachpb.GetResponse
// ScanResp can contain a partial response to a ScanRequest (when Complete
// is false). In that case, there will be a further result with the
// continuation; that result will refer to the same original request (see
// Position and EnqueueKeysSatisfied below). Notably, SQL rows will never
// be split across multiple results.
ScanResp struct {
// The response is always using BATCH_RESPONSE format (meaning that Rows
// field is always nil). IntentRows field is also nil.
*roachpb.ScanResponse
// If the Result represents a scan result, Complete indicates whether
// this is the last response for the respective scan, or if there are
// more responses to come. In any case, ScanResp never contains partial
// rows (i.e. a single row is never split into different Results).
//
// When running in InOrder mode, Results for a single scan will be
// delivered in key order (in addition to results for different scans
// being delivered in request order). When running in OutOfOrder mode,
// Results for a single scan can be delivered out of key order (in
// addition to results for different scans being delivered out of
// request order).
Complete bool
}
// EnqueueKeysSatisfied identifies the requests that this Result satisfies.
// In OutOfOrder mode, a single Result can satisfy multiple identical
// requests. In InOrder mode a Result can only satisfy multiple consecutive
// requests.
EnqueueKeysSatisfied []int
// memoryTok describes the memory reservation of this Result that needs to
// be released back to the Streamer's budget when the Result is Release()'d.
// When memoryTok.streamer is nil, Release() is a no-op.
memoryTok struct {
// streamer is the Streamer whose budget holds the reservation.
streamer *Streamer
// toRelease is the number of bytes handed back to the budget on
// Release().
toRelease int64
}
// Position tracks the ordinal among all originally enqueued requests that
// this result satisfies. See singleRangeBatch.positions for more details.
//
// If Streamer.Enqueue() was called with nil enqueueKeys argument, then
// EnqueueKeysSatisfied will exactly contain Position; if non-nil
// enqueueKeys argument was passed, then Position is used as an ordinal to
// lookup into enqueueKeys to populate EnqueueKeysSatisfied.
// TODO(yuzefovich): this might need to be []int when non-unique requests
// are supported.
Position int
// subRequestIdx allows us to order two Results that come for the same
// original Scan request but from different ranges. It is non-zero only in
// InOrder mode when Hints.SingleRowLookup is false, in all other cases it
// will remain zero. See singleRangeBatch.subRequestIdx for more details.
subRequestIdx int
// subRequestDone is true if the current Result is the last one for the
// corresponding sub-request. For all Get requests and for Scan requests
// contained within a single range, it is always true since those can only
// have a single sub-request.
//
// Note that for correctness, it is only necessary that this value is set
// properly if this Result is a Scan response and Hints.SingleRowLookup is
// false.
subRequestDone bool
}
// Hints provides different hints to the Streamer for optimization purposes.
// The hints are handed to the Streamer via Streamer.Init.
type Hints struct {
// UniqueRequests tells the Streamer that the requests will be unique. As
// such, there's no point in de-duping them or caching results.
//
// Note that Streamer.Init currently panics with an assertion failure if
// this is false: only unique requests are supported at the moment.
UniqueRequests bool
// SingleRowLookup tells the Streamer that each enqueued request will result
// in a single row lookup (in other words, the request contains a "key"). If
// true, then the Streamer knows that no request will be split across
// multiple ranges, so some internal state can be optimized away (for
// example, Enqueue skips the per-ScanRequest range bookkeeping).
SingleRowLookup bool
}
// Release must be invoked by the recipient of the Result exactly once, as soon
// as the Result is no longer needed. If this was the last (or only) reference
// to this Result, the memory it uses becomes available again in the Streamer's
// budget.
//
// Internally, Results are refcounted. Multiple Results referencing the same
// GetResp/ScanResp can be returned from separate `GetResults()` calls, and the
// Streamer internally does buffering and caching of Results - which also
// contributes to the refcounts.
//
// Release is a no-op for a Result that is not tied to a Streamer.
func (r Result) Release(ctx context.Context) {
	s := r.memoryTok.streamer
	if s == nil {
		// No Streamer is attached to this Result, so there is no reservation
		// to give back.
		return
	}
	s.results.releaseOne()
	// Lock ordering: the budget's mutex is always acquired before the
	// Streamer's mutex. Both are held until the function returns.
	s.budget.mu.Lock()
	defer s.budget.mu.Unlock()
	s.budget.releaseLocked(ctx, r.memoryTok.toRelease)
	s.mu.Lock()
	defer s.mu.Unlock()
	s.signalBudgetIfNoRequestsInProgressLocked()
}
// Streamer provides a streaming oriented API for reading from the KV layer.
//
// The example usage is roughly as follows:
//
//   s := NewStreamer(...)
//   s.Init(mode, hints, maxKeysPerRow, diskBuffer)
//   ...
//   for needMoreKVs {
//     // Check whether there are results to the previously enqueued requests.
//     // This will block if no results are available, but there are some
//     // enqueued requests.
//     results, err := s.GetResults(ctx)
//     // err check
//     ...
//     if len(results) > 0 {
//       processResults(results)
//       // return to the client
//       ...
//       // when results are no longer needed, Release() them
//     }
//     // All previously enqueued requests have already been responded to.
//     if moreRequestsToEnqueue {
//       err := s.Enqueue(ctx, requests, enqueueKeys)
//       // err check
//       ...
//     } else {
//       // done
//       ...
//     }
//   }
//   ...
//   s.Close(ctx)
//
// The Streamer builds on top of the BatchRequest API provided by the DistSender
// and aims to allow for executing the requests in parallel (to improve the
// performance) while setting the memory limits on those requests (for stability
// purposes).
//
// The parallelism is achieved by splitting the incoming requests into
// single-range batches where each such batch will hit a fast-path in the
// DistSender (unless there have been changes to range boundaries). Since these
// batches are executed concurrently, the LeafTxns are used.
//
// The memory limit handling is achieved by the Streamer guessing the size of
// the response for each request and setting TargetBytes accordingly. The
// concurrency of the Streamer is limited by its memory limit.
//
// The Streamer additionally utilizes different optimizations to improve the
// performance:
// - when possible, sorting requests in key order to take advantage of low-level
// Pebble locality optimizations
// - when necessary, buffering the responses received out of order
// - when necessary, caching the responses to short-circuit repeated lookups.
// TODO(yuzefovich): add an optimization of transparent refreshes when there is
// a single Streamer in the local flow.
// TODO(yuzefovich): support pipelining of Enqueue and GetResults calls.
type Streamer struct {
// distSender is used by Enqueue to create the range iterator that splits
// the enqueued requests into single-range batches.
distSender *kvcoord.DistSender
// stopper is used to launch the worker coordinator's goroutine and to
// derive a context that is canceled on quiesce.
stopper *stop.Stopper
// mode, hints, and maxKeysPerRow are set by Init.
mode OperationMode
hints Hints
maxKeysPerRow int32
// budget is created in NewStreamer from the memory account and limitBytes
// given there.
budget *budget
// coordinator is the worker coordinator whose mainLoop is started by the
// first Enqueue call.
coordinator workerCoordinator
// coordinatorStarted is set to true once the coordinator's mainLoop has
// been successfully launched by Enqueue.
coordinatorStarted bool
// coordinatorCtxCancel cancels the context that the coordinator's mainLoop
// runs with. It is invoked in Close.
coordinatorCtxCancel context.CancelFunc
// waitGroup is waited on in Close. It is incremented in Enqueue for the
// coordinator's goroutine, and mainLoop marks it done when it exits.
waitGroup sync.WaitGroup
// enqueueKeys is the enqueueKeys argument of the latest Enqueue call (it
// can be nil).
enqueueKeys []int
// requestsToServe contains all single-range sub-requests that have yet
// to be served.
requestsToServe requestsProvider
// results are the results of already completed requests that haven't
// been returned by GetResults() yet.
results resultsBuffer
mu struct {
// If the budget's mutex also needs to be locked, the budget's mutex
// must be acquired first.
syncutil.Mutex
// avgResponseEstimator is consulted by the worker coordinator (under
// this mutex) to obtain the average response size estimate.
avgResponseEstimator avgResponseEstimator
// In OutOfOrder mode, numRangesPerScanRequest tracks how many
// ranges a particular originally enqueued ScanRequest touches, but
// scanning of those ranges isn't complete.
//
// In InOrder mode, it tracks how many ranges a particular originally
// enqueued ScanRequest touches. In other words, it contains how many
// "sub-requests" the original Scan request was broken down into.
//
// It is allocated lazily if Hints.SingleRowLookup is false when the
// first ScanRequest is encountered in Enqueue.
// TODO(yuzefovich): perform memory accounting for this.
numRangesPerScanRequest []int
// numRequestsInFlight tracks the number of single-range batches that
// are currently being served asynchronously (i.e. those that have
// already left requestsToServe queue, but for which we haven't received
// the results yet).
numRequestsInFlight int
// done is set to true once the Streamer is closed meaning the worker
// coordinator must exit.
done bool
}
}
// streamerConcurrencyLimit is an upper bound on the number of asynchronous
// requests that a single Streamer can have in flight. The default value for
// this setting is chosen arbitrarily as 1/8th of the default value for the
// senderConcurrencyLimit (NOTE(review): senderConcurrencyLimit is defined
// outside of this file - confirm the ratio still holds).
//
// Concretely, the default is the larger of 128 and 8*GOMAXPROCS, evaluated
// once when the setting is registered. NewStreamer reads the setting to size
// the coordinator's async semaphore.
var streamerConcurrencyLimit = settings.RegisterIntSetting(
settings.TenantWritable,
"kv.streamer.concurrency_limit",
"maximum number of asynchronous requests by a single streamer",
max(128, int64(8*runtime.GOMAXPROCS(0))),
settings.PositiveInt,
)
// max returns the larger of the two given int64 values (either one when they
// are equal).
func max(a, b int64) int64 {
	if b >= a {
		return b
	}
	return a
}
// NewStreamer constructs a Streamer. Init must be called on the returned
// object before requests are enqueued.
//
// txn must be a LeafTxn; an assertion failure panic is raised otherwise.
//
// limitBytes is the maximum amount of memory this Streamer is allowed to use
// (it is used lazily, as needed). The more memory it has, the higher its
// internal concurrency and throughput.
//
// acc should be bound to an unlimited memory monitor; the Streamer itself is
// responsible for staying under limitBytes.
//
// The Streamer takes ownership of the memory account, and the caller is allowed
// to interact with the account only after canceling the Streamer (because
// memory accounts are not thread-safe).
func NewStreamer(
	distSender *kvcoord.DistSender,
	stopper *stop.Stopper,
	txn *kv.Txn,
	st *cluster.Settings,
	lockWaitPolicy lock.WaitPolicy,
	limitBytes int64,
	acc *mon.BoundAccount,
) *Streamer {
	if typ := txn.Type(); typ != kv.LeafTxn {
		panic(errors.AssertionFailedf("RootTxn is given to the Streamer"))
	}
	streamer := &Streamer{
		distSender: distSender,
		stopper:    stopper,
		budget:     newBudget(acc, limitBytes),
	}
	streamer.coordinator = workerCoordinator{
		s:                      streamer,
		txn:                    txn,
		lockWaitPolicy:         lockWaitPolicy,
		requestAdmissionHeader: txn.AdmissionHeader(),
		responseAdmissionQ:     txn.DB().SQLKVResponseAdmissionQ,
		// The semaphore caps the number of async requests of this Streamer at
		// the current value of the streamerConcurrencyLimit setting.
		// TODO(yuzefovich): consider lazily allocating this IntPool only when
		// enqueued requests span multiple batches.
		asyncSem: quotapool.NewIntPool(
			"single Streamer async concurrency",
			uint64(streamerConcurrencyLimit.Get(&st.SV)),
		),
	}
	return streamer
}
// Init prepares the Streamer for use by choosing the requests provider and
// the results buffer that match the operation mode.
//
// mode controls the order in which results are delivered to the client. When
// possible, prefer OutOfOrder mode.
//
// hints can be used to hint the aggressiveness of the caching policy. In
// particular, it can be used to disable caching when the client knows that all
// looked-up keys are unique (e.g. in the case of an index-join). At the moment
// hints.UniqueRequests must be true; Init panics with an assertion failure
// otherwise.
//
// maxKeysPerRow indicates the maximum number of KV pairs that comprise a single
// SQL row (i.e. the number of column families in the index being scanned).
//
// In InOrder mode, diskBuffer argument must be non-nil.
func (s *Streamer) Init(
	mode OperationMode, hints Hints, maxKeysPerRow int, diskBuffer ResultDiskBuffer,
) {
	s.mode = mode
	switch mode {
	case OutOfOrder:
		s.requestsToServe = newOutOfOrderRequestsProvider()
		s.results = newOutOfOrderResultsBuffer(s.budget)
	default:
		// Every mode other than OutOfOrder is handled as InOrder.
		s.requestsToServe = newInOrderRequestsProvider()
		s.results = newInOrderResultsBuffer(s.budget, diskBuffer, hints.SingleRowLookup)
	}
	if uniqueOnly := hints.UniqueRequests; !uniqueOnly {
		panic(errors.AssertionFailedf("only unique requests are currently supported"))
	}
	s.hints, s.maxKeysPerRow = hints, int32(maxKeysPerRow)
}
// Enqueue dispatches multiple requests for execution. Results are delivered
// through the GetResults call. If enqueueKeys is not nil, it needs to contain
// one ID for each request; responses will reference that ID so that the client
// can associate them to the requests. If enqueueKeys is nil, then the responses
// will reference the ordinals of the corresponding requests among reqs.
//
// Multiple requests can specify the same key. In this case, their respective
// responses will also reference the same key. This is useful, for example, for
// "range-based lookup joins" where multiple spans are read in the context of
// the same input-side row (see multiSpanGenerator implementation of
// rowexec.joinReaderSpanGenerator interface for more details).
//
// The Streamer takes over the given requests, will perform the memory
// accounting against its budget and might modify the requests in place.
//
// In InOrder operation mode, responses will be delivered in reqs order.
//
// It is the caller's responsibility to ensure that the memory footprint of reqs
// (i.e. roachpb.Spans inside of the requests) is reasonable. Enqueue will
// return an error if that footprint exceeds the Streamer's limitBytes. The
// exception is made only when a single request is enqueued in order to allow
// the caller to proceed when the key to lookup is arbitrarily large. As a rule
// of thumb though, the footprint of reqs should be on the order of MBs, and not
// tens of MBs.
//
// The first call to Enqueue also launches the worker coordinator's goroutine.
// Any non-nil error returned after that point is additionally recorded in the
// results buffer so that the coordinator can exit promptly.
//
// Currently, enqueuing new requests while there are still requests in progress
// from the previous invocation is prohibited.
// TODO(yuzefovich): lift this restriction and introduce the pipelining.
func (s *Streamer) Enqueue(
ctx context.Context, reqs []roachpb.RequestUnion, enqueueKeys []int,
) (retErr error) {
// Lazily start the worker coordinator on the first call.
if !s.coordinatorStarted {
var coordinatorCtx context.Context
coordinatorCtx, s.coordinatorCtxCancel = s.stopper.WithCancelOnQuiesce(ctx)
s.waitGroup.Add(1)
if err := s.stopper.RunAsyncTaskEx(
coordinatorCtx,
stop.TaskOpts{
TaskName: "streamer-coordinator",
SpanOpt: stop.ChildSpan,
},
s.coordinator.mainLoop,
); err != nil {
// The new goroutine wasn't spun up, so mainLoop won't get executed
// and we have to decrement the wait group ourselves.
s.waitGroup.Done()
return err
}
s.coordinatorStarted = true
}
defer func() {
// Set the error (if present) so that mainLoop of the worker coordinator
// exits as soon as possible, without issuing any requests.
if retErr != nil {
s.results.setError(retErr)
}
}()
// When provided, enqueueKeys must be parallel to reqs.
if enqueueKeys != nil && len(enqueueKeys) != len(reqs) {
return errors.AssertionFailedf("invalid enqueueKeys: len(reqs) = %d, len(enqueueKeys) = %d", len(reqs), len(enqueueKeys))
}
s.enqueueKeys = enqueueKeys
if err := s.results.init(ctx, len(reqs)); err != nil {
return err
}
// The minimal key range encompassing all requests contained within.
// Local addressing has already been resolved.
rs, err := keys.Range(reqs)
if err != nil {
return err
}
// Divide the given requests into single-range batches that are added to
// requestsToServe, and the worker coordinator will then pick those batches
// up to execute asynchronously.
var totalReqsMemUsage int64
// Use a local variable for requestsToServe rather than adding them to the
// requestsProvider right away. This is needed in order for the worker
// coordinator to not pick up any work until we account for
// totalReqsMemUsage.
var requestsToServe []singleRangeBatch
seekKey := rs.Key
const scanDir = kvcoord.Ascending
ri := kvcoord.MakeRangeIterator(s.distSender)
ri.Seek(ctx, seekKey, scanDir)
if !ri.Valid() {
return ri.Error()
}
// firstScanRequest is true until the first ScanRequest is seen; that is
// the point at which the Streamer's mutex is acquired and
// numRangesPerScanRequest is (re)initialized.
firstScanRequest := true
// streamerLocked tracks whether this function currently holds the
// Streamer's mutex. The deferred function below releases the mutex on the
// early-return (error) paths taken while it is held.
streamerLocked := false
defer func() {
if streamerLocked {
s.mu.Unlock()
}
}()
for ; ri.Valid(); ri.Seek(ctx, seekKey, scanDir) {
// Truncate the request span to the current range.
singleRangeSpan, err := rs.Intersect(ri.Token().Desc())
if err != nil {
return err
}
// Find all requests that touch the current range.
singleRangeReqs, positions, err := kvcoord.Truncate(reqs, singleRangeSpan)
if err != nil {
return err
}
// subRequestIdx stays nil unless we're in InOrder mode and the current
// range is touched by at least one ScanRequest.
var subRequestIdx []int
if !s.hints.SingleRowLookup {
for i, pos := range positions {
if _, isScan := reqs[pos].GetInner().(*roachpb.ScanRequest); isScan {
if firstScanRequest {
// We have some ScanRequests, and each might touch
// multiple ranges, so we have to set up
// numRangesPerScanRequest.
streamerLocked = true
s.mu.Lock()
if cap(s.mu.numRangesPerScanRequest) < len(reqs) {
s.mu.numRangesPerScanRequest = make([]int, len(reqs))
} else {
// We can reuse numRangesPerScanRequest allocated on
// the previous call to Enqueue after we zero it
// out.
s.mu.numRangesPerScanRequest = s.mu.numRangesPerScanRequest[:len(reqs)]
for n := 0; n < len(s.mu.numRangesPerScanRequest); {
n += copy(s.mu.numRangesPerScanRequest[n:], zeroIntSlice)
}
}
}
if s.mode == InOrder {
if subRequestIdx == nil {
subRequestIdx = make([]int, len(singleRangeReqs))
}
// The number of ranges seen so far for this Scan is the
// ordinal of the current sub-request.
subRequestIdx[i] = s.mu.numRangesPerScanRequest[pos]
}
s.mu.numRangesPerScanRequest[pos]++
firstScanRequest = false
}
}
}
// TODO(yuzefovich): perform the de-duplication here.
//if !s.hints.UniqueRequests {
//}
r := singleRangeBatch{
reqs: singleRangeReqs,
positions: positions,
subRequestIdx: subRequestIdx,
reqsReservedBytes: requestsMemUsage(singleRangeReqs),
}
totalReqsMemUsage += r.reqsReservedBytes
if s.mode == OutOfOrder {
// Sort all single-range requests to be in the key order.
// TODO(yuzefovich): we should be able to sort not head-of-the-line
// request in the InOrder mode too; however, there would be
// complications whenever a request (either original or with
// ResumeSpans) is put back because in such a scenario any request
// can become head-of-the-line in the future. We probably will need
// to introduce a way to "restore" the original order within
// singleRangeBatch if it is sorted and issued with headOfLine=true.
sort.Sort(&r)
}
requestsToServe = append(requestsToServe, r)
// Determine next seek key, taking potentially sparse requests into
// consideration.
//
// In next iteration, query next range.
// It's important that we use the EndKey of the current descriptor
// as opposed to the StartKey of the next one: if the former is stale,
// it's possible that the next range has since merged the subsequent
// one, and unless both descriptors are stale, the next descriptor's
// StartKey would move us to the beginning of the current range,
// resulting in a duplicate scan.
seekKey, err = kvcoord.Next(reqs, ri.Desc().EndKey)
rs.Key = seekKey
if err != nil {
return err
}
}
if streamerLocked {
// Per the contract of the budget's mutex (which must be acquired first,
// before the Streamer's mutex), we cannot hold the mutex of s when
// consuming below, so we have to unlock it.
s.mu.Unlock()
streamerLocked = false
}
// We allow the budget to go into debt iff a single request was enqueued.
// This is needed to support the case of arbitrarily large keys - the caller
// is expected to produce requests with such cases one at a time.
allowDebt := len(reqs) == 1
if err = s.budget.consume(ctx, totalReqsMemUsage, allowDebt); err != nil {
return err
}
// Memory reservation was approved, so the requests are good to go.
if debug {
fmt.Printf("enqueuing %s to serve\n", reqsToString(requestsToServe))
}
s.requestsToServe.enqueue(requestsToServe)
return nil
}
// GetResults blocks until at least one result is available. In OutOfOrder
// mode any result will do, and the caller is expected to examine
// Result.EnqueueKeysSatisfied to understand which request the result
// corresponds to. In InOrder mode only head-of-line results will do. A
// zero-length result slice is returned once all enqueued requests have been
// responded to.
//
// If the context is found to be canceled after waiting, the context's error
// is recorded in the results buffer and returned.
func (s *Streamer) GetResults(ctx context.Context) ([]Result, error) {
	for {
		results, allComplete, err := s.results.get(ctx)
		if len(results) == 0 && !allComplete && err == nil {
			// Nothing to hand out yet, so block until the results buffer
			// signals us and then retry.
			if debug {
				fmt.Println("client blocking to wait for results")
			}
			s.results.wait()
			// Check whether the Streamer has been canceled or closed while we
			// were waiting for the results.
			if err = ctx.Err(); err != nil {
				s.results.setError(err)
				return nil, err
			}
			continue
		}
		if debug {
			if len(results) == 0 {
				suffix := "all requests have been responded to"
				if !allComplete {
					suffix = fmt.Sprintf("%v", err)
				}
				fmt.Printf("returning no results to the client because %s\n", suffix)
			} else {
				printSubRequestIdx := s.mode == InOrder && !s.hints.SingleRowLookup
				fmt.Printf("returning %s to the client\n", resultsToString(results, printSubRequestIdx))
			}
		}
		return results, err
	}
}
// Close cancels all in-flight operations and releases all of the resources of
// the Streamer. It blocks until all goroutines created by the Streamer exit,
// and then resets the Streamer to its zero value. No other calls on s are
// allowed after this.
func (s *Streamer) Close(ctx context.Context) {
	if s.coordinatorStarted {
		s.coordinatorCtxCancel()
		// Flip the done flag under the Streamer's mutex so that the worker
		// coordinator observes it.
		func() {
			s.mu.Lock()
			defer s.mu.Unlock()
			s.mu.done = true
		}()
		s.requestsToServe.close()
		s.results.close(ctx)
		// The coordinator might be parked waiting for the budget; wake it up
		// so that it can notice the shutdown.
		s.budget.mu.waitForBudget.Signal()
	}
	s.waitGroup.Wait()
	*s = Streamer{}
}
// getNumRequestsInProgress reports how many requests are currently "in
// progress": the already issued requests that are still in flight plus the
// results that have not been released yet. The caller must NOT be holding the
// lock of s because it is acquired here.
func (s *Streamer) getNumRequestsInProgress() int {
	s.mu.Lock()
	defer s.mu.Unlock()
	inFlight := s.mu.numRequestsInFlight
	return inFlight + s.results.numUnreleased()
}
// signalBudgetIfNoRequestsInProgressLocked signals the budget's condition
// variable when nothing is in progress, i.e. when there are neither in-flight
// requests nor unreleased results.
//
// The explicit signal is needed to make sure that if the budget doesn't get
// out of debt, the worker coordinator doesn't keep waiting for some other
// request to be completed / other result to be released.
//
// The mutex of s must be held.
func (s *Streamer) signalBudgetIfNoRequestsInProgressLocked() {
	s.mu.AssertHeld()
	if s.mu.numRequestsInFlight != 0 || s.results.numUnreleased() != 0 {
		// Something is still in progress and will signal the budget itself.
		return
	}
	s.budget.mu.waitForBudget.Signal()
}
// adjustNumRequestsInFlight adds delta (which may be negative) to the number
// of requests that are currently in flight and then signals the budget if
// nothing remains in progress. The caller must NOT be holding the lock of s
// because it is acquired here.
func (s *Streamer) adjustNumRequestsInFlight(delta int) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.mu.numRequestsInFlight = s.mu.numRequestsInFlight + delta
	s.signalBudgetIfNoRequestsInProgressLocked()
}
// workerCoordinator holds the state of the Streamer's coordinator, whose
// mainLoop is launched as an async task by the first Streamer.Enqueue call. It
// is populated in NewStreamer.
type workerCoordinator struct {
// s is the Streamer that owns this coordinator.
s *Streamer
// txn is the transaction given to NewStreamer (asserted there to be a
// LeafTxn).
txn *kv.Txn
// lockWaitPolicy is the policy given to NewStreamer.
// NOTE(review): presumably it is applied to the issued batches - confirm
// against the code that builds them.
lockWaitPolicy lock.WaitPolicy
// asyncSem is a quota pool created in NewStreamer with the capacity taken
// from the streamerConcurrencyLimit setting; it bounds the number of
// asynchronous requests of a single Streamer.
asyncSem *quotapool.IntPool
// For request and response admission control. Both are derived from txn in
// NewStreamer.
requestAdmissionHeader roachpb.AdmissionHeader
responseAdmissionQ *admission.WorkQueue
}
// mainLoop runs throughout the lifetime of the Streamer (from the first Enqueue
// call until Close) and routes the single-range batches for asynchronous
// execution. On every iteration it waits for requests to serve, waits until
// enough of the Streamer's budget is available for at least the first of them,
// determines how many requests may be issued, and issues them. Batches are not
// started while the available budget is insufficient.
//
// The loop exits when any of the steps reports an error (which is recorded in
// the results buffer) or indicates that the coordinator should exit. The
// Streamer's wait group is marked done on exit.
func (w *workerCoordinator) mainLoop(ctx context.Context) {
	defer w.s.waitGroup.Done()
	for {
		if err := w.waitForRequests(ctx); err != nil {
			w.s.results.setError(err)
			return
		}

		var (
			atLeastBytes int64
			// The higher the value of priority is, the lower the actual
			// priority of spilling. Use the maximum value by default.
			spillingPriority = math.MaxInt64
		)
		w.s.requestsToServe.Lock()
		if hasWork := !w.s.requestsToServe.emptyLocked(); hasWork {
			// If minTargetBytes is already set on the first request to be
			// issued, then that is how much budget we need at the very least.
			atLeastBytes = w.s.requestsToServe.firstLocked().minTargetBytes
			// The first request has the highest urgency among all current
			// requests to serve, so its priority is used to spill everything
			// with less urgency when necessary to free up the budget.
			spillingPriority = w.s.requestsToServe.firstLocked().priority()
		}
		w.s.requestsToServe.Unlock()

		avgResponseSize, exit := w.getAvgResponseSize()
		if exit {
			return
		}
		if atLeastBytes == 0 {
			// Fall back to the average response size estimate.
			atLeastBytes = avgResponseSize
		}

		if w.waitUntilEnoughBudget(ctx, atLeastBytes, spillingPriority) {
			return
		}

		// Now check how many requests we can issue.
		numToIssue, exit := w.getMaxNumRequestsToIssue(ctx)
		if exit {
			return
		}

		if err := w.issueRequestsForAsyncProcessing(ctx, numToIssue, avgResponseSize); err != nil {
			w.s.results.setError(err)
			return
		}
	}
}
// waitForRequests blocks until there is at least one request to be served. It
// returns right away if some requests are already queued up; otherwise, it
// waits on the requests provider once.
//
// The context's error is returned if the context is canceled by the time the
// wait is over. nil is returned in all other cases, including when the
// Streamer has been closed or has encountered an error in the meantime.
func (w *workerCoordinator) waitForRequests(ctx context.Context) error {
	w.s.requestsToServe.Lock()
	defer w.s.requestsToServe.Unlock()
	if !w.s.requestsToServe.emptyLocked() {
		return nil
	}
	w.s.requestsToServe.waitLocked()
	// Check if the Streamer has been canceled or closed while we were waiting.
	if err := ctx.Err(); err != nil {
		return err
	}
	w.s.mu.Lock()
	shouldExit := w.s.results.error() != nil || w.s.mu.done
	w.s.mu.Unlock()
	if shouldExit {
		return nil
	}
	// In test builds, assert that we were woken up for a reason.
	if buildutil.CrdbTestBuild && w.s.requestsToServe.emptyLocked() {
		panic(errors.AssertionFailedf("unexpectedly zero requests to serve after waiting "))
	}
	return nil
}
// getAvgResponseSize returns the current average response size estimate
// together with a boolean indicating whether the coordinator should exit
// (true when the results buffer holds an error or the Streamer is done). Both
// are read while holding the Streamer's mutex.
func (w *workerCoordinator) getAvgResponseSize() (avgResponseSize int64, shouldExit bool) {
	w.s.mu.Lock()
	defer w.s.mu.Unlock()
	return w.s.mu.avgResponseEstimator.getAvgResponseSize(),
		w.s.results.error() != nil || w.s.mu.done
}
// waitUntilEnoughBudget waits until atLeastBytes bytes are available in the
// budget. While the budget is insufficient, it first asks the results buffer
// to spill results (using spillingPriority) and, if that doesn't help, waits
// on the budget's condition variable - unless no requests are in progress, in
// which case waiting is pointless and the method returns.
//
// The budget's mutex is held for the duration of the call (except while
// waiting on the condition variable).
//
// A boolean that indicates whether the coordinator should exit is returned. It
// is true when spilling fails or the context is canceled; in both cases the
// error is recorded in the results buffer.
func (w *workerCoordinator) waitUntilEnoughBudget(
	ctx context.Context, atLeastBytes int64, spillingPriority int,
) (shouldExit bool) {
	b := w.s.budget
	b.mu.Lock()
	defer b.mu.Unlock()
	// available computes how many bytes are currently left in the budget. It
	// must be called with the budget's mutex held.
	available := func() int64 {
		return b.limitBytes - b.mu.acc.Used()
	}
	for available() < atLeastBytes {
		// There isn't enough budget at the moment.
		//
		// First, ask the results buffer to spill some results to disk in order
		// to free up the missing amount.
		spilled, err := w.s.results.spill(ctx, atLeastBytes-available(), spillingPriority)
		if err != nil {
			w.s.results.setError(err)
			return true
		}
		if spilled {
			// The spilling was successful.
			return false
		}
		// The spilling didn't succeed, so we need to wait for budget to open
		// up - but only if something is in progress that can free it up.
		if w.s.getNumRequestsInProgress() == 0 {
			// We have a degenerate case when a single row is expected to exceed
			// the budget.
			return false
		}
		if debug {
			fmt.Printf(
				"waiting for budget to free up: atLeastBytes %d, available %d\n",
				atLeastBytes, available(),
			)
		}
		// We have to wait for some budget.release() calls.
		b.mu.waitForBudget.Wait()
		// Check if the Streamer has been canceled or closed while we were
		// waiting.
		if err := ctx.Err(); err != nil {
			w.s.results.setError(err)
			return true
		}
	}
	return false
}
// getMaxNumRequestsToIssue reports how many new async requests the worker
// coordinator may issue right now without going over the
// streamerConcurrencyLimit limit. If none can be issued at the moment, it
// blocks until one can.
//
// Blocking here guarantees that creating a new async task in
// performRequestAsync never blocks on w.asyncSem. Were it to block there, a
// deadlock would be possible: the worker coordinator's main goroutine would be
// holding the budget's mutex while waiting for quota, and every asynchronous
// request capable of freeing that quota would be stuck trying to acquire the
// budget's mutex.
//
// The second return value indicates whether the coordinator should exit; it is
// true when acquiring the quota fails, in which case the error is set on the
// results buffer.
func (w *workerCoordinator) getMaxNumRequestsToIssue(ctx context.Context) (_ int, shouldExit bool) {
	// The worker coordinator goroutine is the only one that acquires quota
	// from the semaphore, so the "approximate" quota is in fact exact at this
	// moment.
	if available := w.asyncSem.ApproximateQuota(); available > 0 {
		return int(available), false
	}
	// All of the quota is in use. Block until a single unit can be acquired
	// and hand it right back: we only needed to learn that it is available.
	oneUnit, err := w.asyncSem.Acquire(ctx, 1)
	if err != nil {
		w.s.results.setError(err)
		return 0, true
	}
	oneUnit.Release()
	return 1, false
}
// issueRequestsForAsyncProcessing walks the single-range requests supplied by
// the requestsProvider and issues them for asynchronous evaluation for as long
// as the budget has room for their responses. Once the budget runs out, no
// further requests are issued. The sole exception is when nothing is in
// progress (neither requests in flight nor unreleased results): in that case
// exactly one request is issued regardless.
//
// maxNumRequestsToIssue caps the number of requests issued by this call. The
// caller guarantees that w.asyncSem has at least that much quota available.
//
// avgResponseSize is the per-request size estimate used to compute the
// TargetBytes limits.
//
// A non-nil error is returned only when reserving the budget for the
// head-of-the-line request fails.
func (w *workerCoordinator) issueRequestsForAsyncProcessing(
	ctx context.Context, maxNumRequestsToIssue int, avgResponseSize int64,
) error {
	w.s.requestsToServe.Lock()
	defer w.s.requestsToServe.Unlock()
	w.s.budget.mu.Lock()
	defer w.s.budget.mu.Unlock()

	headOfLine := w.s.getNumRequestsInProgress() == 0
	exhausted := false
	for left := maxNumRequestsToIssue; !w.s.requestsToServe.emptyLocked() && left > 0 && !exhausted; left-- {
		next := w.s.requestsToServe.firstLocked()
		budgetLeft := w.s.budget.limitBytes - w.s.budget.mu.acc.Used()

		// floor is the smallest TargetBytes limit with which issuing this
		// request is worthwhile; with anything smaller the response is very
		// likely to come back empty.
		floor := next.minTargetBytes
		if floor == 0 {
			floor = avgResponseSize
		}
		if budgetLeft < floor {
			if !headOfLine {
				// Not enough budget for this request, but other requests
				// are in progress, so let some of them finish first.
				return nil
			}
			exhausted = true
			if budgetLeft < 1 {
				// The budget is already in debt while nothing is in
				// flight, which happens with a very large roachpb.Span in
				// the request. To still make progress, use the smallest
				// possible TargetBytes while asking the KV layer not to
				// return an empty response.
				budgetLeft = 1
			}
		}

		// Pick the TargetBytes limit for the BatchRequest that will be built
		// from this request. The estimate serves as the guess for how much
		// memory the response needs, and that much is reserved up front.
		//
		// TargetBytes is a strict limit on the response size, except for the
		// degenerate case of a head-of-the-line request that gets back a
		// single very large row exceeding the limit.
		targetBytes := int64(len(next.reqs)) * avgResponseSize
		// The estimate can be too low when response sizes vary a lot, so
		// never go below what's needed for a non-empty response.
		if targetBytes < next.minTargetBytes {
			targetBytes = next.minTargetBytes
		}
		// If the estimate says the full response doesn't fit, still issue the
		// request with a truncated limit, hoping for a partial response, so
		// that the available budget is fully utilized.
		if targetBytes > budgetLeft {
			targetBytes = budgetLeft
		}

		err := w.s.budget.consumeLocked(ctx, targetBytes, headOfLine /* allowDebt */)
		if err != nil {
			// The budget going into debt cannot be the reason: either debt
			// is allowed (headOfLine is true), or targetBytes was truncated
			// above to fit into the available budget while we hold the
			// budget's mutex. So the root memory pool has been exhausted.
			if headOfLine {
				// Nothing is in progress, so exit to be safe and not OOM the
				// node. Most likely there are concurrent memory-intensive
				// queries that this Streamer has no control over.
				//
				// A lower targetBytes (unless it is already 1) could have
				// been tried, but an exhausted root memory pool suggests the
				// node might already be overloaded, so it seems better not
				// to ask it to receive any more responses for now.
				return err
			}
			// Other requests are in progress; let them finish / be released.
			//
			// This is opportunistic: the hope is that once those requests
			// are fully processed (i.e. their results are Release()'d), this
			// request can proceed without exceeding the root memory pool.
			//
			// Pushing the node towards OOM isn't a real concern since we
			// still stay within the root memory pool limit (which should
			// leave a safety gap with the available RAM). And if other
			// queries consume the whole root memory pool, the
			// head-of-the-line request will notice and exit accordingly.
			return nil
		}

		if debug {
			fmt.Printf(
				"issuing an async request for positions %v, targetBytes=%d, headOfLine=%t\n",
				next.positions, targetBytes, headOfLine,
			)
		}
		w.performRequestAsync(ctx, next, targetBytes, headOfLine)
		w.s.requestsToServe.removeFirstLocked()
		// Only the very first request issued by this call can be the
		// head-of-the-line one.
		headOfLine = false
	}
	return nil
}
// budgetMuAlreadyLocked must be true if the caller is currently holding the
// budget's mutex.
func (w *workerCoordinator) asyncRequestCleanup(budgetMuAlreadyLocked bool) {
if !budgetMuAlreadyLocked {
// Since we're decrementing the number of requests in flight, we want to
// make sure that the budget's mutex is locked, and it currently isn't.
// This is needed so that if we signal the budget in
// adjustNumRequestsInFlight, the worker coordinator doesn't miss the
// signal.
//
// If we don't do this, then it is possible for the worker coordinator
// to be blocked forever in waitUntilEnoughBudget. Namely, the following
// sequence of events is possible:
// 1. the worker coordinator checks that there are some requests in
// progress, then it goes to sleep before waiting on waitForBudget