Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REP-5318 Retry on change stream failures. #52

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ require (
github.com/pkg/errors v0.9.1
github.com/rs/zerolog v1.28.0
github.com/samber/lo v1.47.0
github.com/samber/mo v1.13.0
github.com/stretchr/testify v1.8.0
github.com/urfave/cli v1.22.9
go.mongodb.org/mongo-driver v1.17.1
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/samber/lo v1.47.0 h1:z7RynLwP5nbyRscyvcD043DWYoOcYRv3mV8lBeqOCLc=
github.com/samber/lo v1.47.0/go.mod h1:RmDH9Ct32Qy3gduHQuKJ3gW1fMHAnE/fAzQuf6He5cU=
github.com/samber/mo v1.13.0 h1:LB1OwfJMju3a6FjghH+AIvzMG0ZPOzgTWj1qaHs1IQ4=
github.com/samber/mo v1.13.0/go.mod h1:BfkrCPuYzVG3ZljnZB783WIJIGk1mcZr9c9CPf8tAxs=
github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
Expand Down
5 changes: 3 additions & 2 deletions internal/util/error.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ import (
// `ErrorCode` newtype, but that requires a more invasive change to everything
// that uses error codes.
const (
LockFailed int = 107
SampleTooManyDuplicates int = 28799
LockFailed = 107
SampleTooManyDuplicates = 28799
CursorKilled = 237
)

//
Expand Down
178 changes: 126 additions & 52 deletions internal/verifier/change_stream.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,12 @@ import (
"time"

"github.com/10gen/migration-verifier/internal/keystring"
"github.com/10gen/migration-verifier/internal/retry"
"github.com/10gen/migration-verifier/internal/util"
"github.com/10gen/migration-verifier/option"
"github.com/pkg/errors"
"github.com/rs/zerolog"
"github.com/samber/mo"
"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/bson/primitive"
"go.mongodb.org/mongo-driver/mongo"
Expand Down Expand Up @@ -63,7 +67,7 @@ func (verifier *Verifier) HandleChangeStreamEvents(ctx context.Context, batch []
for i, changeEvent := range batch {
if changeEvent.ClusterTime != nil &&
(verifier.lastChangeEventTime == nil ||
verifier.lastChangeEventTime.Compare(*changeEvent.ClusterTime) < 0) {
verifier.lastChangeEventTime.Before(*changeEvent.ClusterTime)) {
verifier.lastChangeEventTime = changeEvent.ClusterTime
}
switch changeEvent.OpType {
Expand Down Expand Up @@ -135,14 +139,14 @@ func (verifier *Verifier) GetChangeStreamFilter() []bson.D {
// is unideal but shouldn’t impede correctness since post-writesOff events
// shouldn’t really happen anyway by definition.
func (verifier *Verifier) readAndHandleOneChangeEventBatch(
ctx context.Context,
sctx mongo.SessionContext,
cs *mongo.ChangeStream,
) error {
eventsRead := 0
var changeEventBatch []ParsedEvent

for hasEventInBatch := true; hasEventInBatch; hasEventInBatch = cs.RemainingBatchLength() > 0 {
gotEvent := cs.TryNext(ctx)
gotEvent := cs.TryNext(sctx)

if cs.Err() != nil {
return errors.Wrap(cs.Err(), "change stream iteration failed")
Expand All @@ -167,25 +171,31 @@ func (verifier *Verifier) readAndHandleOneChangeEventBatch(
return nil
}

err := verifier.HandleChangeStreamEvents(ctx, changeEventBatch)
// Update the change stream lag, which we’ll display in the logs.
lag, err := computeChangeStreamLag(sctx, cs)
if err == nil {
verifier.changeStreamLag.Store(option.Some(lag))
} else {
verifier.logger.Debug().Err(err).Msg("Failed to compute change stream lag.")
}

err = verifier.HandleChangeStreamEvents(sctx, changeEventBatch)
if err != nil {
return errors.Wrap(err, "failed to handle change events")
}

return nil
}

func (verifier *Verifier) iterateChangeStream(ctx context.Context, cs *mongo.ChangeStream) {
defer cs.Close(ctx)

func (verifier *Verifier) iterateChangeStream(sctx mongo.SessionContext, cs *mongo.ChangeStream) error {
var lastPersistedTime time.Time

persistResumeTokenIfNeeded := func() error {
if time.Since(lastPersistedTime) <= minChangeStreamPersistInterval {
return nil
}

err := verifier.persistChangeStreamResumeToken(ctx, cs)
err := verifier.persistChangeStreamResumeToken(sctx, cs)
if err == nil {
lastPersistedTime = time.Now()
}
Expand All @@ -200,11 +210,8 @@ func (verifier *Verifier) iterateChangeStream(ctx context.Context, cs *mongo.Cha
select {

// If the context is canceled, return immmediately.
case <-ctx.Done():
verifier.logger.Debug().
Err(ctx.Err()).
Msg("Change stream quitting.")
return
case <-sctx.Done():
return sctx.Err()

// If the changeStreamEnderChan has a message, the user has indicated that
// source writes are ended. This means we should exit rather than continue
Expand All @@ -222,11 +229,12 @@ func (verifier *Verifier) iterateChangeStream(ctx context.Context, cs *mongo.Cha
var curTs primitive.Timestamp
curTs, err = extractTimestampFromResumeToken(cs.ResumeToken())
if err != nil {
err = errors.Wrap(err, "failed to extract timestamp from change stream's resume token")
break
return errors.Wrap(err, "failed to extract timestamp from change stream's resume token")
}

if curTs.After(writesOffTs) {
// writesOffTs never refers to a real event,
// so we can stop once curTs >= writesOffTs.
if !curTs.Before(writesOffTs) {
verifier.logger.Debug().
Interface("currentTimestamp", curTs).
Interface("writesOffTimestamp", writesOffTs).
Expand All @@ -235,30 +243,22 @@ func (verifier *Verifier) iterateChangeStream(ctx context.Context, cs *mongo.Cha
break
}

err = verifier.readAndHandleOneChangeEventBatch(ctx, cs)
err = verifier.readAndHandleOneChangeEventBatch(sctx, cs)

if err != nil {
break
return err
}
}

default:
err = verifier.readAndHandleOneChangeEventBatch(ctx, cs)
err = verifier.readAndHandleOneChangeEventBatch(sctx, cs)

if err == nil {
err = persistResumeTokenIfNeeded()
}
}

if err != nil && !errors.Is(err, context.Canceled) {
verifier.logger.Debug().
Err(err).
Msg("Sending change stream error.")

verifier.changeStreamErrChan <- err

if !gotwritesOffTimestamp {
break
if err != nil {
return err
}
}

Expand All @@ -284,10 +284,13 @@ func (verifier *Verifier) iterateChangeStream(ctx context.Context, cs *mongo.Cha
}

infoLog.Msg("Change stream is done.")

return nil
}

// StartChangeStream starts the change stream.
func (verifier *Verifier) StartChangeStream(ctx context.Context) error {
func (verifier *Verifier) createChangeStream(
sctx mongo.SessionContext,
) (*mongo.ChangeStream, primitive.Timestamp, error) {
pipeline := verifier.GetChangeStreamFilter()
opts := options.ChangeStream().
SetMaxAwaitTime(1 * time.Second).
Expand All @@ -297,9 +300,9 @@ func (verifier *Verifier) StartChangeStream(ctx context.Context) error {
opts = opts.SetCustomPipeline(bson.M{"showExpandedEvents": true})
}

savedResumeToken, err := verifier.loadChangeStreamResumeToken(ctx)
savedResumeToken, err := verifier.loadChangeStreamResumeToken(sctx)
if err != nil {
return errors.Wrap(err, "failed to load persisted change stream resume token")
return nil, primitive.Timestamp{}, errors.Wrap(err, "failed to load persisted change stream resume token")
}

csStartLogEvent := verifier.logger.Info()
Expand All @@ -324,42 +327,99 @@ func (verifier *Verifier) StartChangeStream(ctx context.Context) error {
csStartLogEvent.Msg("Starting change stream from current source cluster time.")
}

sess, err := verifier.srcClient.StartSession()
if err != nil {
return errors.Wrap(err, "failed to start session")
}
sctx := mongo.NewSessionContext(ctx, sess)
srcChangeStream, err := verifier.srcClient.Watch(sctx, pipeline, opts)
if err != nil {
return errors.Wrap(err, "failed to open change stream")
return nil, primitive.Timestamp{}, errors.Wrap(err, "failed to open change stream")
}

err = verifier.persistChangeStreamResumeToken(ctx, srcChangeStream)
err = verifier.persistChangeStreamResumeToken(sctx, srcChangeStream)
if err != nil {
return err
return nil, primitive.Timestamp{}, err
}

csTimestamp, err := extractTimestampFromResumeToken(srcChangeStream.ResumeToken())
startTs, err := extractTimestampFromResumeToken(srcChangeStream.ResumeToken())
if err != nil {
return errors.Wrap(err, "failed to extract timestamp from change stream's resume token")
return nil, primitive.Timestamp{}, errors.Wrap(err, "failed to extract timestamp from change stream's resume token")
}

clusterTime, err := getClusterTimeFromSession(sess)
// With sharded clusters the resume token might lead the cluster time
// by 1 increment. In that case we need the actual cluster time;
// otherwise we will get errors.
clusterTime, err := getClusterTimeFromSession(sctx)
if err != nil {
return errors.Wrap(err, "failed to read cluster time from session")
return nil, primitive.Timestamp{}, errors.Wrap(err, "failed to read cluster time from session")
}

verifier.srcStartAtTs = &csTimestamp
if csTimestamp.After(clusterTime) {
verifier.srcStartAtTs = &clusterTime
if startTs.After(clusterTime) {
startTs = clusterTime
}

return srcChangeStream, startTs, nil
}

// StartChangeStream starts the change stream.
func (verifier *Verifier) StartChangeStream(ctx context.Context) error {
// Result seems a bit simpler than messing with 2 separate channels.
resultChan := make(chan mo.Result[primitive.Timestamp])
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a bit confused. Could you clarify what result does this resultChan expect?


go func() {
retryer := retry.New(retry.DefaultDurationLimit)
retryer = retryer.WithErrorCodes(util.CursorKilled)

parentThreadWaiting := true

err := retryer.
RunForTransientErrorsOnly(
ctx,
verifier.logger,
func(i *retry.Info) error {
sess, err := verifier.srcClient.StartSession()
if err != nil {
return errors.Wrap(err, "failed to start change stream session")
}

sctx := mongo.NewSessionContext(ctx, sess)

srcChangeStream, startTs, err := verifier.createChangeStream(sctx)
if err != nil {
return err
}

defer srcChangeStream.Close(ctx)

if parentThreadWaiting {
resultChan <- mo.Ok(startTs)
close(resultChan)
parentThreadWaiting = false
}

return verifier.iterateChangeStream(sctx, srcChangeStream)
},
)

if err != nil {
if parentThreadWaiting {
resultChan <- mo.Err[primitive.Timestamp](err)
} else {
verifier.changeStreamErrChan <- err
close(verifier.changeStreamErrChan)
}
}
}()

result := <-resultChan

startTs, err := result.Get()
if err != nil {
return err
}

verifier.srcStartAtTs = &startTs

verifier.mux.Lock()
verifier.changeStreamRunning = true
verifier.mux.Unlock()

go verifier.iterateChangeStream(ctx, srcChangeStream)

return nil
}

Expand Down Expand Up @@ -388,12 +448,26 @@ func (verifier *Verifier) loadChangeStreamResumeToken(ctx context.Context) (bson
return token, err
}

func (verifier *Verifier) persistChangeStreamResumeToken(ctx context.Context, cs *mongo.ChangeStream) error {
func computeChangeStreamLag(
sctx mongo.SessionContext,
cs *mongo.ChangeStream,
) (time.Duration, error) {
token := cs.ResumeToken()
ts, err := extractTimestampFromResumeToken(token)
if err != nil {
return 0, errors.Wrap(err, "failed to extract timestamp from change stream resume token")
}

optime := sctx.OperationTime()
return time.Second * time.Duration(optime.T-ts.T), nil
}

func (verifier *Verifier) persistChangeStreamResumeToken(sctx mongo.SessionContext, cs *mongo.ChangeStream) error {
token := cs.ResumeToken()

coll := verifier.getChangeStreamMetadataCollection()
_, err := coll.ReplaceOne(
ctx,
sctx,
bson.D{{"_id", "resumeToken"}},
token,
options.Replace().SetUpsert(true),
Expand Down
Loading
Loading