This repository has been archived by the owner on Aug 30, 2019. It is now read-only.

obfuscate: account for bracketed identifiers as AS parameters (#474)
* obfuscate: account for bracketed identifiers as AS parameters

* obfuscate: made tokenConsumer resettable and private
gbbr authored Sep 26, 2018
1 parent af98b3c commit c200dbb
Showing 4 changed files with 82 additions and 63 deletions.
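As an illustration of the new behavior, here is a minimal sketch (not part of the commit). It assumes the code sits inside the obfuscate package, since newTokenConsumer and tokenConsumer.Process become unexported in this change; the input query and the expected output are taken verbatim from the TestSQLQuantizer case added below.

package obfuscate

import "fmt"

// Example_bracketedIdentifiers replays the new test case: the bracketed
// identifier following AS ("[b]") is discarded, while the other bracketed
// identifiers are kept.
func Example_bracketedIdentifiers() {
	consumer := newTokenConsumer()
	out, err := consumer.Process(`SELECT [b].[BlogId], [b].[Name]
FROM [Blogs] AS [b]
ORDER BY [b].[Name]`)
	if err != nil {
		panic(err)
	}
	fmt.Println(out)
	// Output: SELECT [ b ] . [ BlogId ], [ b ] . [ Name ] FROM [ Blogs ] ORDER BY [ b ] . [ Name ]
}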
6 changes: 5 additions & 1 deletion obfuscate/obfuscate.go
@@ -13,6 +13,7 @@ import (
// concurrent use.
type Obfuscator struct {
opts *config.ObfuscationConfig
sql *tokenConsumer
es *jsonObfuscator // nil if disabled
mongo *jsonObfuscator // nil if disabled
}
@@ -22,7 +23,10 @@ func NewObfuscator(cfg *config.ObfuscationConfig) *Obfuscator {
if cfg == nil {
cfg = new(config.ObfuscationConfig)
}
o := Obfuscator{opts: cfg}
o := Obfuscator{
opts: cfg,
sql: newTokenConsumer(),
}
if cfg.ES.Enabled {
o.es = newJSONObfuscator(&cfg.ES)
}
100 changes: 57 additions & 43 deletions obfuscate/sql.go
@@ -13,26 +13,43 @@ const (
sqlQuantizeError = "agent.parse.error"
)

// TokenFilter is a generic interface that a TokenConsumer expects. It defines
// tokenFilter is a generic interface that a tokenConsumer expects. It defines
// the Filter() function used to filter or replace given tokens.
// A filter can be stateful and keep an internal state to apply the filter later;
// this can be useful to prevent backtracking in some cases.
type TokenFilter interface {
type tokenFilter interface {
Filter(token, lastToken int, buffer []byte) (int, []byte)
Reset()
}

// DiscardFilter implements the TokenFilter interface so that the given
// discardFilter implements the tokenFilter interface so that the given
// token is discarded or accepted.
type DiscardFilter struct{}
type discardFilter struct{}

// Filter the given token so that a `nil` slice is returned if the token
// is in the token filtered list.
func (f *DiscardFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) {
func (f *discardFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) {
// filters based on previous token
switch lastToken {
case FilteredBracketedIdentifier:
if token != ']' {
// we haven't found the closing bracket yet, keep going
if token != ID {
// the token between the brackets *must* be an identifier,
// otherwise the query is invalid.
return LexError, nil
}
return FilteredBracketedIdentifier, nil
}
fallthrough
case As:
// prevent the next comma from being part of a GroupingFilter
if token == '[' {
// the identifier followed by AS is an MSSQL bracketed identifier
// and will continue to be discarded until we find the corresponding
// closing bracket counter-part. See GitHub issue #475.
return FilteredBracketedIdentifier, nil
}
// prevent the next comma from being part of a groupingFilter
return FilteredComma, nil
}
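In other words, the filter now acts as a small state machine: after an AS keyword an opening bracket starts a FilteredBracketedIdentifier run, the identifier inside the brackets keeps being discarded, and the closing bracket falls through to the As case so the following comma cannot be grouped. A hypothetical trace (not part of the commit), assuming it lives in the obfuscate package next to these types, where As, ID, FilteredBracketedIdentifier and FilteredComma are the tokenizer's constants:

package obfuscate

import "fmt"

// demoBracketedDiscard feeds the filter the token sequence the consumer
// would produce for the tail of "... AS [b]".
func demoBracketedDiscard() {
	f := &discardFilter{}

	// After AS, '[' switches the filter into bracketed-identifier mode.
	tok, _ := f.Filter('[', As, []byte("["))
	fmt.Println(tok == FilteredBracketedIdentifier) // true

	// The identifier between the brackets is still discarded.
	tok, _ = f.Filter(ID, tok, []byte("b"))
	fmt.Println(tok == FilteredBracketedIdentifier) // true

	// ']' closes the identifier and falls through to the As case, so the
	// next comma cannot become part of a groupingFilter group.
	tok, _ = f.Filter(']', tok, []byte("]"))
	fmt.Println(tok == FilteredComma) // true
}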

@@ -48,15 +65,15 @@ func (f *DiscardFilter) Filter(token, lastToken int, buffer []byte) (int, []byte
}
}

// Reset in a DiscardFilter is a noop action
func (f *DiscardFilter) Reset() {}
// Reset in a discardFilter is a noop action
func (f *discardFilter) Reset() {}

// ReplaceFilter implements the TokenFilter interface so that the given
// replaceFilter implements the tokenFilter interface so that the given
// token is replaced with '?' or left unchanged.
type ReplaceFilter struct{}
type replaceFilter struct{}

// Filter the given token so that it will be replaced if in the token replacement list
func (f *ReplaceFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) {
func (f *replaceFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) {
switch lastToken {
case Savepoint:
return Filtered, []byte("?")
@@ -69,12 +86,12 @@ func (f *ReplaceFilter) Filter(token, lastToken int, buffer []byte) (int, []byte
}
}

// Reset in a ReplaceFilter is a noop action
func (f *ReplaceFilter) Reset() {}
// Reset in a replaceFilter is a noop action
func (f *replaceFilter) Reset() {}

// GroupingFilter implements the TokenFilter interface so that when
// groupingFilter implements the tokenFilter interface so that when
// a common pattern is identified, it's discarded to prevent duplicates
type GroupingFilter struct {
type groupingFilter struct {
groupFilter int
groupMulti int
}
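For context, the groupingFilter exists to collapse runs of already-filtered values, such as an IN list or a multi-row VALUES clause, into a single group. A minimal sketch (not part of the commit, assuming the obfuscate package):

package obfuscate

import "fmt"

// demoGrouping shows the groupingFilter collapsing a run of filtered values.
func demoGrouping() {
	consumer := newTokenConsumer()
	out, err := consumer.Process("SELECT * FROM users WHERE id IN (1, 2, 3)")
	if err != nil {
		panic(err)
	}
	// The three literals are first replaced with '?' by the replaceFilter and
	// the whole IN list then collapses to a single "( ? )" group in the output.
	fmt.Println(out)
}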
@@ -83,7 +100,7 @@ type GroupingFilter struct {
// has been recognized. A grouping is composed by items like:
// * '( ?, ?, ? )'
// * '( ?, ? ), ( ?, ? )'
func (f *GroupingFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) {
func (f *groupingFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) {
// increasing the number of groups means that we're filtering an entire group
// because it can be represented with a single '( ? )'
if (lastToken == '(' && token == Filtered) || (token == '(' && f.groupMulti > 0) {
@@ -115,26 +132,26 @@ func (f *GroupingFilter) Filter(token, lastToken int, buffer []byte) (int, []byt
return token, buffer
}

// Reset in a GroupingFilter restores variables used to count
// Reset in a groupingFilter restores variables used to count
// escaped token that should be filtered
func (f *GroupingFilter) Reset() {
func (f *groupingFilter) Reset() {
f.groupFilter = 0
f.groupMulti = 0
}

// TokenConsumer is a Tokenizer consumer. It calls the Tokenizer Scan() function until tokens
// tokenConsumer is a Tokenizer consumer. It calls the Tokenizer Scan() function until tokens
// are available or if a LEX_ERROR is raised. After retrieving a token, it is sent in the
// TokenFilter chains so that the token is discarded or replaced.
type TokenConsumer struct {
// tokenFilter chains so that the token is discarded or replaced.
type tokenConsumer struct {
tokenizer *Tokenizer
filters []TokenFilter
filters []tokenFilter
lastToken int
}

// Process the given SQL or No-SQL string so that the resulting one is properly altered. This
// function is generic and the behavior changes according to chosen TokenFilter implementations.
// The process calls all filters inside the []TokenFilter.
func (t *TokenConsumer) Process(in string) (string, error) {
// function is generic and the behavior changes according to chosen tokenFilter implementations.
// The process calls all filters inside the []tokenFilter.
func (t *tokenConsumer) Process(in string) (string, error) {
out := &bytes.Buffer{}
t.tokenizer.InStream.Reset(in)

@@ -150,7 +167,10 @@ func (t *TokenConsumer) Process(in string) (string, error) {

// apply all registered filters
for _, f := range t.filters {
token, buff = f.Filter(token, t.lastToken, buff)
if token, buff = f.Filter(token, t.lastToken, buff); token == LexError {
t.Reset()
return "", errors.New("the tokenizer was unable to process the string")
}
}

// write the resulting buffer
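Note the new early return above: if any filter reports LexError (as the discardFilter now does for an invalid bracketed identifier), Process resets the consumer and returns an error instead of partially obfuscated output. A sketch of that path (not part of the commit, assuming the obfuscate package), reusing the unbalanced-bracket query from the new test case:

package obfuscate

import "fmt"

// demoBracketError: the unterminated "[b" makes the discardFilter return
// LexError, so Process gives up, resets its state and reports an error.
func demoBracketError() {
	consumer := newTokenConsumer()
	out, err := consumer.Process(`SELECT [b].[BlogId], [b].[Name]
FROM [Blogs] AS [b
ORDER BY [b].[Name]`)
	fmt.Println(out == "")  // true: no partial output is returned
	fmt.Println(err != nil) // true
}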
@@ -182,38 +202,32 @@ func (t *TokenConsumer) Process(in string) (string, error) {
}

// Reset restores the initial states for all components so that memory can be re-used
func (t *TokenConsumer) Reset() {
func (t *tokenConsumer) Reset() {
t.tokenizer.Reset()
for _, f := range t.filters {
f.Reset()
}
}

// NewTokenConsumer returns a new TokenConsumer capable to process SQL and No-SQL strings.
func NewTokenConsumer(filters []TokenFilter) *TokenConsumer {
return &TokenConsumer{
// newTokenConsumer returns a new tokenConsumer capable to process SQL and No-SQL strings.
func newTokenConsumer() *tokenConsumer {
return &tokenConsumer{
tokenizer: NewStringTokenizer(""),
filters: filters,
filters: []tokenFilter{
&discardFilter{},
&replaceFilter{},
&groupingFilter{},
},
}
}

// token consumer that will quantize the query with
// the given filters; this quantizer is used only
// for SQL and CQL strings
var tokenQuantizer = NewTokenConsumer(
[]TokenFilter{
&DiscardFilter{},
&ReplaceFilter{},
&GroupingFilter{},
})

// QuantizeSQL generates resource and sql.query meta for SQL spans
func (*Obfuscator) obfuscateSQL(span *model.Span) {
func (o *Obfuscator) obfuscateSQL(span *model.Span) {
if span.Resource == "" {
return
}

quantizedString, err := tokenQuantizer.Process(span.Resource)
quantizedString, err := o.sql.Process(span.Resource)
if err != nil || quantizedString == "" {
// if we have an error, the partially parsed SQL is discarded so that we don't pollute
// users resources. Here we provide more details to debug the problem.
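A side effect of removing the package-level tokenQuantizer is that each Obfuscator now owns its SQL consumer, so separate Obfuscators no longer share filter state such as lastToken. A minimal sketch of the new wiring (not part of the commit; only identifiers shown in the diff above are used):

package obfuscate

// demoPerObfuscatorConsumer: every Obfuscator gets its own consumer from
// newTokenConsumer() inside NewObfuscator, instead of a shared global.
func demoPerObfuscatorConsumer() {
	o1 := NewObfuscator(nil)
	o2 := NewObfuscator(nil)
	// o1.sql and o2.sql are independent *tokenConsumer values, so the two
	// instances do not step on each other's filter state.
	_, _ = o1, o2
}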
34 changes: 15 additions & 19 deletions obfuscate/sql_test.go
@@ -335,6 +335,18 @@ func TestSQLQuantizer(t *testing.T) {
`,
`SELECT t1.userid, t1.fullname, t1.firm_id, t2.firmname, t1.email, t1.location, t1.state, t1.phone, t1.url, DATE_FORMAT ( t1.lastmod, %m/%d/%Y %h:%i:%s ), t1.lastmod, t1.user_status, t1.pw_expire, DATE_FORMAT ( t1.pw_expire, %m/%d/%Y ), t1.addr1, t1.addr2, t1.zipcode, t1.office_id, t1.default_group, t3.firm_status, t1.title FROM userdata LEFT JOIN lawfirm_names ON t1.firm_id = t2.firm_id LEFT JOIN lawfirms ON t1.firm_id = t3.firm_id WHERE t1.userid = ?`,
},
{
`SELECT [b].[BlogId], [b].[Name]
FROM [Blogs] AS [b]
ORDER BY [b].[Name]`,
`SELECT [ b ] . [ BlogId ], [ b ] . [ Name ] FROM [ Blogs ] ORDER BY [ b ] . [ Name ]`,
},
{
`SELECT [b].[BlogId], [b].[Name]
FROM [Blogs] AS [b
ORDER BY [b].[Name]`,
`Non-parsable SQL query`,
},
}

for _, c := range cases {
@@ -361,14 +373,8 @@ func TestMultipleProcess(t *testing.T) {
},
}

filters := []TokenFilter{
&DiscardFilter{},
&ReplaceFilter{},
&GroupingFilter{},
}

// The consumer is the same between executions
consumer := NewTokenConsumer(filters)
consumer := newTokenConsumer()

for _, tc := range testCases {
output, err := consumer.Process(tc.query)
@@ -383,12 +389,7 @@ func TestConsumerError(t *testing.T) {
// Malformed SQL is not accepted and the outer component knows
// what to do with malformed SQL
input := "SELECT * FROM users WHERE users.id = '1 AND users.name = 'dog'"
filters := []TokenFilter{
&DiscardFilter{},
&ReplaceFilter{},
&GroupingFilter{},
}
consumer := NewTokenConsumer(filters)
consumer := newTokenConsumer()

output, err := consumer.Process(input)
assert.NotNil(err)
@@ -404,12 +405,7 @@ func BenchmarkTokenizer(b *testing.B) {
{"Escaping", `INSERT INTO delayed_jobs (attempts, created_at, failed_at, handler, last_error, locked_at, locked_by, priority, queue, run_at, updated_at) VALUES (0, '2016-12-04 17:09:59', NULL, '--- !ruby/object:Delayed::PerformableMethod\nobject: !ruby/object:Item\n store:\n - a simple string\n - an \'escaped \' string\n - another \'escaped\' string\n - 42\n string: a string with many \\\\\'escapes\\\\\'\nmethod_name: :show_store\nargs: []\n', NULL, NULL, NULL, 0, NULL, '2016-12-04 17:09:59', '2016-12-04 17:09:59')`},
{"Grouping", `INSERT INTO delayed_jobs (created_at, failed_at, handler) VALUES (0, '2016-12-04 17:09:59', NULL), (0, '2016-12-04 17:09:59', NULL), (0, '2016-12-04 17:09:59', NULL), (0, '2016-12-04 17:09:59', NULL)`},
}
filters := []TokenFilter{
&DiscardFilter{},
&ReplaceFilter{},
&GroupingFilter{},
}
consumer := NewTokenConsumer(filters)
consumer := newTokenConsumer()

for _, bm := range benchmarks {
b.Run(bm.name+"/"+strconv.Itoa(len(bm.query)), func(b *testing.B) {
5 changes: 5 additions & 0 deletions obfuscate/sql_tokenizer.go
@@ -44,6 +44,11 @@ const (
// FilteredComma specifies that the token is a comma and was discarded by one
// of the filters.
FilteredComma = 57366

// FilteredBracketedIdentifier specifies that we are currently discarding
// a bracketed identifier (MSSQL).
// See issue https://github.com/DataDog/datadog-trace-agent/issues/475.
FilteredBracketedIdentifier = 57367
)

// Tokenizer is the struct used to generate SQL
