diff --git a/obfuscate/obfuscate.go b/obfuscate/obfuscate.go index b98ca2b38..82c4217bf 100644 --- a/obfuscate/obfuscate.go +++ b/obfuscate/obfuscate.go @@ -13,6 +13,7 @@ import ( // concurrent use. type Obfuscator struct { opts *config.ObfuscationConfig + sql *tokenConsumer es *jsonObfuscator // nil if disabled mongo *jsonObfuscator // nil if disabled } @@ -22,7 +23,10 @@ func NewObfuscator(cfg *config.ObfuscationConfig) *Obfuscator { if cfg == nil { cfg = new(config.ObfuscationConfig) } - o := Obfuscator{opts: cfg} + o := Obfuscator{ + opts: cfg, + sql: newTokenConsumer(), + } if cfg.ES.Enabled { o.es = newJSONObfuscator(&cfg.ES) } diff --git a/obfuscate/sql.go b/obfuscate/sql.go index 208df94ba..603cb114e 100644 --- a/obfuscate/sql.go +++ b/obfuscate/sql.go @@ -13,26 +13,43 @@ const ( sqlQuantizeError = "agent.parse.error" ) -// TokenFilter is a generic interface that a TokenConsumer expects. It defines +// tokenFilter is a generic interface that a tokenConsumer expects. It defines // the Filter() function used to filter or replace given tokens. // A filter can be stateful and keep an internal state to apply the filter later; // this can be useful to prevent backtracking in some cases. -type TokenFilter interface { +type tokenFilter interface { Filter(token, lastToken int, buffer []byte) (int, []byte) Reset() } -// DiscardFilter implements the TokenFilter interface so that the given +// discardFilter implements the tokenFilter interface so that the given // token is discarded or accepted. -type DiscardFilter struct{} +type discardFilter struct{} // Filter the given token so that a `nil` slice is returned if the token // is in the token filtered list. -func (f *DiscardFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) { +func (f *discardFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) { // filters based on previous token switch lastToken { + case FilteredBracketedIdentifier: + if token != ']' { + // we haven't found the closing bracket yet, keep going + if token != ID { + // the token between the brackets *must* be an identifier, + // otherwise the query is invalid. + return LexError, nil + } + return FilteredBracketedIdentifier, nil + } + fallthrough case As: - // prevent the next comma from being part of a GroupingFilter + if token == '[' { + // the identifier followed by AS is an MSSQL bracketed identifier + // and will continue to be discarded until we find the corresponding + // closing bracket counter-part. See GitHub issue #475. + return FilteredBracketedIdentifier, nil + } + // prevent the next comma from being part of a groupingFilter return FilteredComma, nil } @@ -48,15 +65,15 @@ func (f *DiscardFilter) Filter(token, lastToken int, buffer []byte) (int, []byte } } -// Reset in a DiscardFilter is a noop action -func (f *DiscardFilter) Reset() {} +// Reset in a discardFilter is a noop action +func (f *discardFilter) Reset() {} -// ReplaceFilter implements the TokenFilter interface so that the given +// replaceFilter implements the tokenFilter interface so that the given // token is replaced with '?' or left unchanged. -type ReplaceFilter struct{} +type replaceFilter struct{} // Filter the given token so that it will be replaced if in the token replacement list -func (f *ReplaceFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) { +func (f *replaceFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) { switch lastToken { case Savepoint: return Filtered, []byte("?") @@ -69,12 +86,12 @@ func (f *ReplaceFilter) Filter(token, lastToken int, buffer []byte) (int, []byte } } -// Reset in a ReplaceFilter is a noop action -func (f *ReplaceFilter) Reset() {} +// Reset in a replaceFilter is a noop action +func (f *replaceFilter) Reset() {} -// GroupingFilter implements the TokenFilter interface so that when +// groupingFilter implements the tokenFilter interface so that when // a common pattern is identified, it's discarded to prevent duplicates -type GroupingFilter struct { +type groupingFilter struct { groupFilter int groupMulti int } @@ -83,7 +100,7 @@ type GroupingFilter struct { // has been recognized. A grouping is composed by items like: // * '( ?, ?, ? )' // * '( ?, ? ), ( ?, ? )' -func (f *GroupingFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) { +func (f *groupingFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) { // increasing the number of groups means that we're filtering an entire group // because it can be represented with a single '( ? )' if (lastToken == '(' && token == Filtered) || (token == '(' && f.groupMulti > 0) { @@ -115,26 +132,26 @@ func (f *GroupingFilter) Filter(token, lastToken int, buffer []byte) (int, []byt return token, buffer } -// Reset in a GroupingFilter restores variables used to count +// Reset in a groupingFilter restores variables used to count // escaped token that should be filtered -func (f *GroupingFilter) Reset() { +func (f *groupingFilter) Reset() { f.groupFilter = 0 f.groupMulti = 0 } -// TokenConsumer is a Tokenizer consumer. It calls the Tokenizer Scan() function until tokens +// tokenConsumer is a Tokenizer consumer. It calls the Tokenizer Scan() function until tokens // are available or if a LEX_ERROR is raised. After retrieving a token, it is sent in the -// TokenFilter chains so that the token is discarded or replaced. -type TokenConsumer struct { +// tokenFilter chains so that the token is discarded or replaced. +type tokenConsumer struct { tokenizer *Tokenizer - filters []TokenFilter + filters []tokenFilter lastToken int } // Process the given SQL or No-SQL string so that the resulting one is properly altered. This -// function is generic and the behavior changes according to chosen TokenFilter implementations. -// The process calls all filters inside the []TokenFilter. -func (t *TokenConsumer) Process(in string) (string, error) { +// function is generic and the behavior changes according to chosen tokenFilter implementations. +// The process calls all filters inside the []tokenFilter. +func (t *tokenConsumer) Process(in string) (string, error) { out := &bytes.Buffer{} t.tokenizer.InStream.Reset(in) @@ -150,7 +167,10 @@ func (t *TokenConsumer) Process(in string) (string, error) { // apply all registered filters for _, f := range t.filters { - token, buff = f.Filter(token, t.lastToken, buff) + if token, buff = f.Filter(token, t.lastToken, buff); token == LexError { + t.Reset() + return "", errors.New("the tokenizer was unable to process the string") + } } // write the resulting buffer @@ -182,38 +202,32 @@ func (t *TokenConsumer) Process(in string) (string, error) { } // Reset restores the initial states for all components so that memory can be re-used -func (t *TokenConsumer) Reset() { +func (t *tokenConsumer) Reset() { t.tokenizer.Reset() for _, f := range t.filters { f.Reset() } } -// NewTokenConsumer returns a new TokenConsumer capable to process SQL and No-SQL strings. -func NewTokenConsumer(filters []TokenFilter) *TokenConsumer { - return &TokenConsumer{ +// newTokenConsumer returns a new tokenConsumer capable to process SQL and No-SQL strings. +func newTokenConsumer() *tokenConsumer { + return &tokenConsumer{ tokenizer: NewStringTokenizer(""), - filters: filters, + filters: []tokenFilter{ + &discardFilter{}, + &replaceFilter{}, + &groupingFilter{}, + }, } } -// token consumer that will quantize the query with -// the given filters; this quantizer is used only -// for SQL and CQL strings -var tokenQuantizer = NewTokenConsumer( - []TokenFilter{ - &DiscardFilter{}, - &ReplaceFilter{}, - &GroupingFilter{}, - }) - // QuantizeSQL generates resource and sql.query meta for SQL spans -func (*Obfuscator) obfuscateSQL(span *model.Span) { +func (o *Obfuscator) obfuscateSQL(span *model.Span) { if span.Resource == "" { return } - quantizedString, err := tokenQuantizer.Process(span.Resource) + quantizedString, err := o.sql.Process(span.Resource) if err != nil || quantizedString == "" { // if we have an error, the partially parsed SQL is discarded so that we don't pollute // users resources. Here we provide more details to debug the problem. diff --git a/obfuscate/sql_test.go b/obfuscate/sql_test.go index 29a63cdb9..ef800e47c 100644 --- a/obfuscate/sql_test.go +++ b/obfuscate/sql_test.go @@ -335,6 +335,18 @@ func TestSQLQuantizer(t *testing.T) { `, `SELECT t1.userid, t1.fullname, t1.firm_id, t2.firmname, t1.email, t1.location, t1.state, t1.phone, t1.url, DATE_FORMAT ( t1.lastmod, %m/%d/%Y %h:%i:%s ), t1.lastmod, t1.user_status, t1.pw_expire, DATE_FORMAT ( t1.pw_expire, %m/%d/%Y ), t1.addr1, t1.addr2, t1.zipcode, t1.office_id, t1.default_group, t3.firm_status, t1.title FROM userdata LEFT JOIN lawfirm_names ON t1.firm_id = t2.firm_id LEFT JOIN lawfirms ON t1.firm_id = t3.firm_id WHERE t1.userid = ?`, }, + { + `SELECT [b].[BlogId], [b].[Name] +FROM [Blogs] AS [b] +ORDER BY [b].[Name]`, + `SELECT [ b ] . [ BlogId ], [ b ] . [ Name ] FROM [ Blogs ] ORDER BY [ b ] . [ Name ]`, + }, + { + `SELECT [b].[BlogId], [b].[Name] +FROM [Blogs] AS [b +ORDER BY [b].[Name]`, + `Non-parsable SQL query`, + }, } for _, c := range cases { @@ -361,14 +373,8 @@ func TestMultipleProcess(t *testing.T) { }, } - filters := []TokenFilter{ - &DiscardFilter{}, - &ReplaceFilter{}, - &GroupingFilter{}, - } - // The consumer is the same between executions - consumer := NewTokenConsumer(filters) + consumer := newTokenConsumer() for _, tc := range testCases { output, err := consumer.Process(tc.query) @@ -383,12 +389,7 @@ func TestConsumerError(t *testing.T) { // Malformed SQL is not accepted and the outer component knows // what to do with malformed SQL input := "SELECT * FROM users WHERE users.id = '1 AND users.name = 'dog'" - filters := []TokenFilter{ - &DiscardFilter{}, - &ReplaceFilter{}, - &GroupingFilter{}, - } - consumer := NewTokenConsumer(filters) + consumer := newTokenConsumer() output, err := consumer.Process(input) assert.NotNil(err) @@ -404,12 +405,7 @@ func BenchmarkTokenizer(b *testing.B) { {"Escaping", `INSERT INTO delayed_jobs (attempts, created_at, failed_at, handler, last_error, locked_at, locked_by, priority, queue, run_at, updated_at) VALUES (0, '2016-12-04 17:09:59', NULL, '--- !ruby/object:Delayed::PerformableMethod\nobject: !ruby/object:Item\n store:\n - a simple string\n - an \'escaped \' string\n - another \'escaped\' string\n - 42\n string: a string with many \\\\\'escapes\\\\\'\nmethod_name: :show_store\nargs: []\n', NULL, NULL, NULL, 0, NULL, '2016-12-04 17:09:59', '2016-12-04 17:09:59')`}, {"Grouping", `INSERT INTO delayed_jobs (created_at, failed_at, handler) VALUES (0, '2016-12-04 17:09:59', NULL), (0, '2016-12-04 17:09:59', NULL), (0, '2016-12-04 17:09:59', NULL), (0, '2016-12-04 17:09:59', NULL)`}, } - filters := []TokenFilter{ - &DiscardFilter{}, - &ReplaceFilter{}, - &GroupingFilter{}, - } - consumer := NewTokenConsumer(filters) + consumer := newTokenConsumer() for _, bm := range benchmarks { b.Run(bm.name+"/"+strconv.Itoa(len(bm.query)), func(b *testing.B) { diff --git a/obfuscate/sql_tokenizer.go b/obfuscate/sql_tokenizer.go index d17e5f40b..9b608b74a 100644 --- a/obfuscate/sql_tokenizer.go +++ b/obfuscate/sql_tokenizer.go @@ -44,6 +44,11 @@ const ( // FilteredComma specifies that the token is a comma and was discarded by one // of the filters. FilteredComma = 57366 + + // FilteredBracketedIdentifier specifies that we are currently discarding + // a bracketed identifier (MSSQL). + // See issue https://github.com/DataDog/datadog-trace-agent/issues/475. + FilteredBracketedIdentifier = 57367 ) // Tokenizer is the struct used to generate SQL