ccl/sqlproxyccl: improve connection logging behavior #134613
@@ -174,13 +174,10 @@ type proxyHandler struct {
cancelInfoMap *cancelInfoMap
}

const throttledErrorHint string = `Connection throttling is triggered by repeated authentication failure. Make
sure the username and password are correct.
`
const throttledErrorHint string = `Connection throttling is triggered by repeated authentication failure. Make sure the username and password are correct.`

var authThrottledError = errors.WithHint(
withCode(errors.New(
"too many failed authentication attempts"), codeProxyRefusedConnection),
withCode(errors.New("too many failed authentication attempts"), codeProxyRefusedConnection),
throttledErrorHint)

// newProxyHandler will create a new proxy handler with configuration based on
@@ -372,34 +369,44 @@ func (handler *proxyHandler) handle(

// NOTE: Errors returned from this function are user-facing errors so we
// should be careful with the details that we want to expose.
//
// TODO(jaylim-crl): Update this such that we return both the internal and
// user-facing errors from clusterNameAndTenantFromParams. Only the internal
// error should be returned to the caller.
backendStartupMsg, clusterName, tenID, err := clusterNameAndTenantFromParams(ctx, fe, handler.metrics)
if err != nil {
clientErr := withCode(err, codeParamsRoutingFailed)
log.Errorf(ctx, "unable to extract cluster name and tenant id: %s", err.Error())
updateMetricsAndSendErrToClient(clientErr, fe.Conn, handler.metrics)
return clientErr
return errors.Wrap(err, "extracting cluster identifier")
}
// Validate the incoming connection and ensure that the cluster name
// matches the tenant's. This avoids malicious actors from attempting to
// connect to the cluster using just the tenant ID.
if err := handler.validateConnection(ctx, tenID, clusterName); err != nil {
// We do not need to log here as validateConnection already logs.
updateMetricsAndSendErrToClient(err, fe.Conn, handler.metrics)
return err
}

Review comment: Why did this block of code move?

Reply: I'd like to ensure that we only add logtags after validating the connection. If we left the validation in its original location, we may end up with a log line that has a mismatched cluster name and tenant ID, leading to confusion during debugging. See the block that comes after the validation:

// Only add logtags after validating the connection. If the connection isn't
// validated, clusterName may not match the tenant ID, and this could cause
// confusion when analyzing logs.
ctx = logtags.AddTag(ctx, "cluster", clusterName)
ctx = logtags.AddTag(ctx, "tenant", tenID)

The proxy uses the tenant ID to spin up a new SQL server. Back then, there were concerns that users could iterate through ...

// Only add logtags after validating the connection. If the connection isn't
// validated, clusterName may not match the tenant ID, and this could cause
// confusion when analyzing logs.
ctx = logtags.AddTag(ctx, "cluster", clusterName)
ctx = logtags.AddTag(ctx, "tenant", tenID)

// Add request tags so that callers can provide a better context for errors.
reqTags := requestTagsFromContext(ctx)
reqTags["cluster"] = clusterName
reqTags["tenant"] = tenID

// Use an empty string as the default port as we only care about the
// correctly parsing the IP address here.
ipAddr, _, err := addr.SplitHostPort(fe.Conn.RemoteAddr().String(), "")
if err != nil {
clientErr := withCode(errors.New("unexpected connection address"), codeParamsRoutingFailed)
log.Errorf(ctx, "could not parse address: %v", err.Error())
updateMetricsAndSendErrToClient(clientErr, fe.Conn, handler.metrics)
return clientErr
}

// Validate the incoming connection and ensure that the cluster name
// matches the tenant's. This avoids malicious actors from attempting to
// connect to the cluster using just the tenant ID.
if err := handler.validateConnection(ctx, tenID, clusterName); err != nil {
// We do not need to log here as validateConnection already logs.
updateMetricsAndSendErrToClient(err, fe.Conn, handler.metrics)
return err
return errors.Wrap(err, "parsing remote address")
}

errConnection := make(chan error, 1)
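The review thread above is about when the cluster and tenant logtags get attached. As a point of reference, here is a minimal sketch of how the cockroachdb/logtags package carries such tags on a context; the cluster name and tenant value below are made up for illustration.

package main

import (
    "context"
    "fmt"

    "github.com/cockroachdb/logtags"
)

func main() {
    ctx := context.Background()

    // Tags are added only after the (cluster, tenant) pair has been
    // validated, so log lines never carry a mismatched combination.
    ctx = logtags.AddTag(ctx, "cluster", "dim-dog")
    ctx = logtags.AddTag(ctx, "tenant", 42)

    // A logging package would prepend these tags to every message emitted
    // with this context; this prints something like "cluster=dim-dog,tenant=42".
    fmt.Println(logtags.FromContext(ctx).String())
}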
@@ -426,20 +433,27 @@ func (handler *proxyHandler) handle(
// with a deleting tenant. This case is rare, and we'll just return a
// "connection refused" error. The next time they connect, they will
// get a "not found" error.
log.Errorf(ctx, "connection blocked by access control list: %v", err)
err = withCode(errors.New("connection refused"), codeProxyRefusedConnection)
updateMetricsAndSendErrToClient(err, fe.Conn, handler.metrics)
return err
//
// TODO(jaylim-crl): We can enrich this error with the proper reason on
// why they were refused (e.g. IP allowlist, or private endpoints).
clientErr := withCode(errors.New("connection refused"), codeProxyRefusedConnection)
updateMetricsAndSendErrToClient(clientErr, fe.Conn, handler.metrics)
return errors.Wrap(err, "connection blocked by access control list")
}
defer removeListener()

throttleTags := throttler.ConnectionTags{IP: ipAddr, TenantID: tenID.String()}
throttleTime, err := handler.throttleService.LoginCheck(throttleTags)
throttleTime, err := handler.throttleService.LoginCheck(ctx, throttleTags)
if err != nil {
log.Errorf(ctx, "throttler refused connection: %v", err.Error())
err = authThrottledError
updateMetricsAndSendErrToClient(err, fe.Conn, handler.metrics)
return err
clientErr := authThrottledError
updateMetricsAndSendErrToClient(clientErr, fe.Conn, handler.metrics)
// The throttle service is used to rate limit invalid login attempts
// from IP addresses, and it is commonly prone to generating excessive
// traffic in practice. Due to that, we'll return a nil here to prevent
// callers from logging this request. However, LoginCheck itself
// periodically logs an error when such requests are rate limited, so
// we won't miss any signals by doing this.
return nil //nolint:returnerrcheck
}

connector := &connector{
@@ -471,14 +485,15 @@ func (handler *proxyHandler) handle(
if err := handler.throttleService.ReportAttempt(
ctx, throttleTags, throttleTime, status,
); err != nil {
// We have to log here because errors returned by this closure
// will be sent to the client.
log.Errorf(ctx, "throttler refused connection after authentication: %v", err.Error())
return authThrottledError
}
return nil
},
)
if err != nil {
log.Errorf(ctx, "could not connect to cluster: %v", err.Error())
if sentToClient {
handler.metrics.updateForError(err)
} else {

@@ -490,16 +505,20 @@ func (handler *proxyHandler) handle(

// Update the cancel info.
handler.cancelInfoMap.addCancelInfo(connector.CancelInfo.proxySecretID(), connector.CancelInfo)
defer func() {
handler.cancelInfoMap.deleteCancelInfo(connector.CancelInfo.proxySecretID())
}()

// Record the connection success and how long it took.
handler.metrics.ConnectionLatency.RecordValue(timeutil.Since(connReceivedTime).Nanoseconds())
handler.metrics.SuccessfulConnCount.Inc(1)

log.Infof(ctx, "new connection")
// TOOD(jaylim-crl): Consider replacing this with a metric that measures
// connection lifetime. We might also be able to fetch these by analyzing
// the session logs.
connBegin := timeutil.Now()
defer func() {
log.Infof(ctx, "closing after %.2fs", timeutil.Since(connBegin).Seconds())
handler.cancelInfoMap.deleteCancelInfo(connector.CancelInfo.proxySecretID())
}()

// Wrap the client connection with an error annotater. WARNING: The TLS
@@ -268,12 +268,38 @@ func (s *Server) serve(ctx context.Context, ln net.Listener, requireProxyProtoco

err = s.Stopper.RunAsyncTask(ctx, "proxy-con-serve", func(ctx context.Context) {
defer func() { _ = conn.Close() }()

s.metrics.CurConnCount.Inc(1)
defer s.metrics.CurConnCount.Dec(1)
remoteAddr := conn.RemoteAddr()
ctxWithTag := logtags.AddTag(ctx, "client", log.SafeOperational(remoteAddr))
if err := s.handler.handle(ctxWithTag, conn, requireProxyProtocol); err != nil {
log.Infof(ctxWithTag, "connection error: %v", err)

ctx = logtags.AddTag(ctx, "client", log.SafeOperational(conn.RemoteAddr()))

// Use a map to collect request-specific information at higher
// layers of the stack. This helps ensure that all relevant
// information is captured, providing better context for the error
// logs.
//
// We could improve this by creating a custom context.Context object
// to track all data related to the request (including migration
// history). For now, this approach is adequate.
reqTags := make(map[string]interface{})
ctx = contextWithRequestTags(ctx, reqTags)

err := s.handler.handle(ctx, conn, requireProxyProtocol)
if err != nil && !errors.Is(err, context.Canceled) {
for key, value := range reqTags {
ctx = logtags.AddTag(ctx, key, value)
}
// log.Infof automatically prints hints (one per line) that are
// associated with the input error object. This causes
// unnecessary log spam, especially when proxy hints are meant
// for the user. We will intentionally create a new error object
// without the hints just for logging purposes.
//
// TODO(jaylim-crl): Ensure that handle does not return user
// facing errors (i.e. one that contains hints).
errWithoutHints := errors.Newf("%s", err.Error()) // nolint:errwrap
log.Infof(ctx, "connection closed: %v", errWithoutHints)
}
})
if err != nil {

Review comment: Can we also change the behavior of ...

Reply: The ...
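The hint-stripping workaround above depends on how the cockroachdb/errors library attaches hints to an error. Here is a small sketch of that behavior, reusing the hint text from this PR; the demo program itself is not part of the change.

package main

import (
    "fmt"

    "github.com/cockroachdb/errors"
)

func main() {
    // An error with a user-facing hint attached, similar to authThrottledError.
    err := errors.WithHint(
        errors.New("too many failed authentication attempts"),
        "Make sure the username and password are correct.",
    )

    // Hints travel with the error and are recoverable by callers...
    fmt.Println(errors.GetAllHints(err)) // [Make sure the username and password are correct.]

    // ...but formatting only the message drops them, which is what the
    // proxy does before logging to avoid spamming user-facing hints.
    errWithoutHints := errors.Newf("%s", err.Error())
    fmt.Println(errors.GetAllHints(errWithoutHints)) // []
}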
@@ -322,3 +348,23 @@ func (s *Server) AwaitNoConnections(ctx context.Context) <-chan struct{} {

return c
}

// requestTagsContextKey is the type of a context.Value key used to carry the
// request tags map in a context.Context object.
type requestTagsContextKey struct{}

// contextWithRequestTags returns a context annotated with the provided request
// tags map. Use requestTagsFromContext(ctx) to retrieve it back.
func contextWithRequestTags(ctx context.Context, reqTags map[string]interface{}) context.Context {
return context.WithValue(ctx, requestTagsContextKey{}, reqTags)
}

// requestTagsFromContext retrieves the request tags map stored in the context
// via contextWithRequestTags.
func requestTagsFromContext(ctx context.Context) map[string]interface{} {
r := ctx.Value(requestTagsContextKey{})
if r == nil {
return nil
}
return r.(map[string]interface{})
}
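The request-tags helpers above boil down to a mutable map carried on the context. Here is a self-contained sketch of the same pattern; the handle stand-in and the tag values are made up for illustration.

package main

import (
    "context"
    "fmt"
)

// requestTagsContextKey is the context.Value key type, as in the diff.
type requestTagsContextKey struct{}

func contextWithRequestTags(ctx context.Context, reqTags map[string]interface{}) context.Context {
    return context.WithValue(ctx, requestTagsContextKey{}, reqTags)
}

func requestTagsFromContext(ctx context.Context) map[string]interface{} {
    if r := ctx.Value(requestTagsContextKey{}); r != nil {
        return r.(map[string]interface{})
    }
    return nil
}

// handle stands in for proxyHandler.handle: deeper layers annotate the
// shared map as they learn more about the request.
func handle(ctx context.Context) error {
    if reqTags := requestTagsFromContext(ctx); reqTags != nil {
        reqTags["cluster"] = "dim-dog"
        reqTags["tenant"] = 42
    }
    return fmt.Errorf("unexpected connection address")
}

func main() {
    // The caller owns the map, so any tags added by handle are visible
    // when it comes time to log the returned error.
    reqTags := make(map[string]interface{})
    ctx := contextWithRequestTags(context.Background(), reqTags)
    if err := handle(ctx); err != nil {
        fmt.Printf("connection closed: %v (tags: %v)\n", err, reqTags)
    }
}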
@@ -72,6 +72,8 @@ func NewLocalService(opts ...LocalOption) Service {
return s
}

var _ Service = (*localService)(nil)
Review comment: What does this line do?

Reply: This ensures that localService implements the Service interface (a compile-time assertion). An equivalent form is:

var _ Service = &localService{}

We use this pattern in CC as well.
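For readers unfamiliar with the idiom, here is a minimal sketch of the compile-time interface assertion; the Greeter interface and englishGreeter type are made-up stand-ins for Service and localService.

package main

import "fmt"

// Greeter is a stand-in for the Service interface in the diff.
type Greeter interface {
    Greet(name string) string
}

type englishGreeter struct{}

// Compile-time assertion: the program fails to build if *englishGreeter
// stops satisfying Greeter, e.g. if the interface gains a new method.
var _ Greeter = (*englishGreeter)(nil)

func (g *englishGreeter) Greet(name string) string {
    return fmt.Sprintf("hello, %s", name)
}

func main() {
    fmt.Println((&englishGreeter{}).Greet("proxy"))
}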
func (s *localService) lockedGetThrottle(connection ConnectionTags) *throttle {
l, ok := s.mu.throttleCache.Get(connection)
if ok && l != nil {

@@ -86,18 +88,26 @@ func (s *localService) lockedInsertThrottle(connection ConnectionTags) *throttle
return l
}

func (s *localService) LoginCheck(connection ConnectionTags) (time.Time, error) {
// LoginCheck implements the Service interface.
func (s *localService) LoginCheck(
ctx context.Context, connection ConnectionTags,
) (time.Time, error) {
s.mu.Lock()
defer s.mu.Unlock()

now := s.clock()
throttle := s.lockedGetThrottle(connection)
if throttle != nil && throttle.isThrottled(now) {
if throttle.everyLog.ShouldLog() {
// ctx should include logtags about the connection.
log.Error(ctx, "throttler refused connection due to too many failed authentication attempts")
}
Comment on lines +101 to +104

Review comment: If we didn't want the service to probe into the internals of the throttle struct (e.g. everyLog), this could be its own method on throttle (e.g. reportThrottled), but I don't feel strongly.
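The suggestion above would move the rate-limited logging behind the throttle type itself. Here is a self-contained sketch of that shape, with a simple time-based limiter standing in for the everyLog field; the field and method names follow the comment, but the implementation details are made up.

package main

import (
    "fmt"
    "sync"
    "time"
)

// everyLogger allows at most one log line per interval, standing in for
// the everyLog field mentioned in the review comment.
type everyLogger struct {
    mu       sync.Mutex
    interval time.Duration
    lastLog  time.Time
}

func (e *everyLogger) shouldLog(now time.Time) bool {
    e.mu.Lock()
    defer e.mu.Unlock()
    if now.Sub(e.lastLog) >= e.interval {
        e.lastLog = now
        return true
    }
    return false
}

// throttle owns its limiter, so callers never touch everyLog directly.
type throttle struct {
    everyLog everyLogger
}

// reportThrottled is the method the reviewer sketches: the throttle decides
// whether this refusal is worth a log line.
func (t *throttle) reportThrottled(now time.Time) {
    if t.everyLog.shouldLog(now) {
        fmt.Println("throttler refused connection due to too many failed authentication attempts")
    }
}

func main() {
    t := &throttle{everyLog: everyLogger{interval: time.Minute}}
    now := time.Now()
    t.reportThrottled(now)                      // logs
    t.reportThrottled(now.Add(time.Second))     // suppressed
    t.reportThrottled(now.Add(2 * time.Minute)) // logs again
}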
return now, errRequestDenied
}
return now, nil
}

// ReportAttempt implements the Service interface.
func (s *localService) ReportAttempt(
ctx context.Context, connection ConnectionTags, throttleTime time.Time, status AttemptStatus,
) error {
Review comment: What is log level 2 and how did you pick it? Are there corresponding constants like error/warn/info/etc.?

Reply: Unfortunately, I don't think we have any rule of thumb around vmodule logging levels. See this internal Slack thread: https://cockroachlabs.slack.com/archives/C9TGBJB44/p1614100926026700. Each vmodule logging level seems to be specific to the file itself. I picked 2 somewhat arbitrarily; it's a middle value, and it opens up opportunities for something less/more verbose as well.

For this case, if we wanted to display logs from the connection tracker, we would start the proxy with the following flag:

--vmodule=conn_tracker=2

(i.e. --vmodule=FILE=LEVEL,FILE2=LEVEL2,...)
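To make the level discussion concrete, here is a tiny sketch of glog-style vmodule gating; the vmodule map and v helper are made-up stand-ins for what CockroachDB's util/log does when the proxy is started with --vmodule=conn_tracker=2.

package main

import "fmt"

// vmodule maps a file name to its verbosity threshold, mimicking the
// --vmodule=conn_tracker=2 flag discussed above.
var vmodule = map[string]int{"conn_tracker": 2}

// v reports whether a message at the given level should be emitted for file.
func v(file string, level int) bool {
    return level <= vmodule[file]
}

func main() {
    // Level 2 is enabled for conn_tracker, so this line is emitted.
    if v("conn_tracker", 2) {
        fmt.Println("conn_tracker: connection cache updated")
    }
    // A more verbose level 3 message stays silent unless the operator
    // raises the threshold, e.g. --vmodule=conn_tracker=3.
    if v("conn_tracker", 3) {
        fmt.Println("conn_tracker: per-connection details")
    }
}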