This repository has been archived by the owner on Aug 30, 2019. It is now read-only.

obfuscate: account for bracketed identifiers as AS parameters (#474)
* obfuscate: account for bracketed identifiers as AS parameters

* obfuscate: made tokenConsumer resettable and private
gbbr authored Sep 26, 2018
1 parent af98b3c commit c200dbb
Showing 4 changed files with 82 additions and 63 deletions.
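As an illustration of the new behavior, here is a minimal sketch (not part of the commit). It assumes the code sits inside the obfuscate package, since newTokenConsumer and tokenConsumer.Process become unexported in this change; the input query and the expected output are taken verbatim from the TestSQLQuantizer case added below.

package obfuscate

import "fmt"

// Example_bracketedIdentifiers replays the new test case: the bracketed
// identifier following AS ("[b]") is discarded, while the other bracketed
// identifiers are kept.
func Example_bracketedIdentifiers() {
	consumer := newTokenConsumer()
	out, err := consumer.Process(`SELECT [b].[BlogId], [b].[Name]
FROM [Blogs] AS [b]
ORDER BY [b].[Name]`)
	if err != nil {
		panic(err)
	}
	fmt.Println(out)
	// Output: SELECT [ b ] . [ BlogId ], [ b ] . [ Name ] FROM [ Blogs ] ORDER BY [ b ] . [ Name ]
}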
6 changes: 5 additions & 1 deletion obfuscate/obfuscate.go
@@ -13,6 +13,7 @@ import (
// concurrent use.
type Obfuscator struct {
opts *config.ObfuscationConfig
sql *tokenConsumer
es *jsonObfuscator // nil if disabled
mongo *jsonObfuscator // nil if disabled
}
@@ -22,7 +23,10 @@ func NewObfuscator(cfg *config.ObfuscationConfig) *Obfuscator {
if cfg == nil {
cfg = new(config.ObfuscationConfig)
}
o := Obfuscator{opts: cfg}
o := Obfuscator{
opts: cfg,
sql: newTokenConsumer(),
}
if cfg.ES.Enabled {
o.es = newJSONObfuscator(&cfg.ES)
}
100 changes: 57 additions & 43 deletions obfuscate/sql.go
@@ -13,26 +13,43 @@ const (
sqlQuantizeError = "agent.parse.error"
)

// TokenFilter is a generic interface that a TokenConsumer expects. It defines
// tokenFilter is a generic interface that a tokenConsumer expects. It defines
// the Filter() function used to filter or replace given tokens.
// A filter can be stateful and keep an internal state to apply the filter later;
// this can be useful to prevent backtracking in some cases.
type TokenFilter interface {
type tokenFilter interface {
Filter(token, lastToken int, buffer []byte) (int, []byte)
Reset()
}

// DiscardFilter implements the TokenFilter interface so that the given
// discardFilter implements the tokenFilter interface so that the given
// token is discarded or accepted.
type DiscardFilter struct{}
type discardFilter struct{}

// Filter the given token so that a `nil` slice is returned if the token
// is in the token filtered list.
func (f *DiscardFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) {
func (f *discardFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) {
// filters based on previous token
switch lastToken {
case FilteredBracketedIdentifier:
if token != ']' {
// we haven't found the closing bracket yet, keep going
if token != ID {
// the token between the brackets *must* be an identifier,
// otherwise the query is invalid.
return LexError, nil
}
return FilteredBracketedIdentifier, nil
}
fallthrough
case As:
// prevent the next comma from being part of a GroupingFilter
if token == '[' {
// the identifier followed by AS is an MSSQL bracketed identifier
// and will continue to be discarded until we find the corresponding
// closing bracket counter-part. See GitHub issue #475.
return FilteredBracketedIdentifier, nil
}
// prevent the next comma from being part of a groupingFilter
return FilteredComma, nil
}
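In other words, the filter now acts as a small state machine: after an AS keyword an opening bracket starts a FilteredBracketedIdentifier run, the identifier inside the brackets keeps being discarded, and the closing bracket falls through to the As case so the following comma cannot be grouped. A hypothetical trace (not part of the commit), assuming it lives in the obfuscate package next to these types, where As, ID, FilteredBracketedIdentifier and FilteredComma are the tokenizer's constants:

package obfuscate

import "fmt"

// demoBracketedDiscard feeds the filter the token sequence the consumer
// would produce for the tail of "... AS [b]".
func demoBracketedDiscard() {
	f := &discardFilter{}

	// After AS, '[' switches the filter into bracketed-identifier mode.
	tok, _ := f.Filter('[', As, []byte("["))
	fmt.Println(tok == FilteredBracketedIdentifier) // true

	// The identifier between the brackets is still discarded.
	tok, _ = f.Filter(ID, tok, []byte("b"))
	fmt.Println(tok == FilteredBracketedIdentifier) // true

	// ']' closes the identifier and falls through to the As case, so the
	// next comma cannot become part of a groupingFilter group.
	tok, _ = f.Filter(']', tok, []byte("]"))
	fmt.Println(tok == FilteredComma) // true
}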

@@ -48,15 +65,15 @@ func (f *DiscardFilter) Filter(token, lastToken int, buffer []byte) (int, []byte
}
}

// Reset in a DiscardFilter is a noop action
func (f *DiscardFilter) Reset() {}
// Reset in a discardFilter is a noop action
func (f *discardFilter) Reset() {}

// ReplaceFilter implements the TokenFilter interface so that the given
// replaceFilter implements the tokenFilter interface so that the given
// token is replaced with '?' or left unchanged.
type ReplaceFilter struct{}
type replaceFilter struct{}

// Filter the given token so that it will be replaced if in the token replacement list
func (f *ReplaceFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) {
func (f *replaceFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) {
switch lastToken {
case Savepoint:
return Filtered, []byte("?")
@@ -69,12 +86,12 @@ func (f *ReplaceFilter) Filter(token, lastToken int, buffer []byte) (int, []byte
}
}

// Reset in a ReplaceFilter is a noop action
func (f *ReplaceFilter) Reset() {}
// Reset in a replaceFilter is a noop action
func (f *replaceFilter) Reset() {}

// GroupingFilter implements the TokenFilter interface so that when
// groupingFilter implements the tokenFilter interface so that when
// a common pattern is identified, it's discarded to prevent duplicates
type GroupingFilter struct {
type groupingFilter struct {
groupFilter int
groupMulti int
}
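For context, the groupingFilter exists to collapse runs of already-filtered values, such as an IN list or a multi-row VALUES clause, into a single group. A minimal sketch (not part of the commit, assuming the obfuscate package):

package obfuscate

import "fmt"

// demoGrouping shows the groupingFilter collapsing a run of filtered values.
func demoGrouping() {
	consumer := newTokenConsumer()
	out, err := consumer.Process("SELECT * FROM users WHERE id IN (1, 2, 3)")
	if err != nil {
		panic(err)
	}
	// The three literals are first replaced with '?' by the replaceFilter and
	// the whole IN list then collapses to a single "( ? )" group in the output.
	fmt.Println(out)
}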
@@ -83,7 +100,7 @@ type GroupingFilter struct {
// has been recognized. A grouping is composed by items like:
// * '( ?, ?, ? )'
// * '( ?, ? ), ( ?, ? )'
func (f *GroupingFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) {
func (f *groupingFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) {
// increasing the number of groups means that we're filtering an entire group
// because it can be represented with a single '( ? )'
if (lastToken == '(' && token == Filtered) || (token == '(' && f.groupMulti > 0) {
@@ -115,26 +132,26 @@ func (f *GroupingFilter) Filter(token, lastToken int, buffer []byte) (int, []byt
return token, buffer
}

// Reset in a GroupingFilter restores variables used to count
// Reset in a groupingFilter restores variables used to count
// escaped token that should be filtered
func (f *GroupingFilter) Reset() {
func (f *groupingFilter) Reset() {
f.groupFilter = 0
f.groupMulti = 0
}

// TokenConsumer is a Tokenizer consumer. It calls the Tokenizer Scan() function until tokens
// tokenConsumer is a Tokenizer consumer. It calls the Tokenizer Scan() function until tokens
// are available or if a LEX_ERROR is raised. After retrieving a token, it is sent in the
// TokenFilter chains so that the token is discarded or replaced.
type TokenConsumer struct {
// tokenFilter chains so that the token is discarded or replaced.
type tokenConsumer struct {
tokenizer *Tokenizer
filters []TokenFilter
filters []tokenFilter
lastToken int
}

// Process the given SQL or No-SQL string so that the resulting one is properly altered. This
// function is generic and the behavior changes according to chosen TokenFilter implementations.
// The process calls all filters inside the []TokenFilter.
func (t *TokenConsumer) Process(in string) (string, error) {
// function is generic and the behavior changes according to chosen tokenFilter implementations.
// The process calls all filters inside the []tokenFilter.
func (t *tokenConsumer) Process(in string) (string, error) {
out := &bytes.Buffer{}
t.tokenizer.InStream.Reset(in)

@@ -150,7 +167,10 @@ func (t *TokenConsumer) Process(in string) (string, error) {

// apply all registered filters
for _, f := range t.filters {
token, buff = f.Filter(token, t.lastToken, buff)
if token, buff = f.Filter(token, t.lastToken, buff); token == LexError {
t.Reset()
return "", errors.New("the tokenizer was unable to process the string")
}
}

// write the resulting buffer
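Note the new early return above: if any filter reports LexError (as the discardFilter now does for an invalid bracketed identifier), Process resets the consumer and returns an error instead of partially obfuscated output. A sketch of that path (not part of the commit, assuming the obfuscate package), reusing the unbalanced-bracket query from the new test case:

package obfuscate

import "fmt"

// demoBracketError: the unterminated "[b" makes the discardFilter return
// LexError, so Process gives up, resets its state and reports an error.
func demoBracketError() {
	consumer := newTokenConsumer()
	out, err := consumer.Process(`SELECT [b].[BlogId], [b].[Name]
FROM [Blogs] AS [b
ORDER BY [b].[Name]`)
	fmt.Println(out == "")  // true: no partial output is returned
	fmt.Println(err != nil) // true
}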
@@ -182,38 +202,32 @@ func (t *TokenConsumer) Process(in string) (string, error) {
}

// Reset restores the initial states for all components so that memory can be re-used
func (t *TokenConsumer) Reset() {
func (t *tokenConsumer) Reset() {
t.tokenizer.Reset()
for _, f := range t.filters {
f.Reset()
}
}

// NewTokenConsumer returns a new TokenConsumer capable to process SQL and No-SQL strings.
func NewTokenConsumer(filters []TokenFilter) *TokenConsumer {
return &TokenConsumer{
// newTokenConsumer returns a new tokenConsumer capable to process SQL and No-SQL strings.
func newTokenConsumer() *tokenConsumer {
return &tokenConsumer{
tokenizer: NewStringTokenizer(""),
filters: filters,
filters: []tokenFilter{
&discardFilter{},
&replaceFilter{},
&groupingFilter{},
},
}
}

// token consumer that will quantize the query with
// the given filters; this quantizer is used only
// for SQL and CQL strings
var tokenQuantizer = NewTokenConsumer(
[]TokenFilter{
&DiscardFilter{},
&ReplaceFilter{},
&GroupingFilter{},
})

// QuantizeSQL generates resource and sql.query meta for SQL spans
func (*Obfuscator) obfuscateSQL(span *model.Span) {
func (o *Obfuscator) obfuscateSQL(span *model.Span) {
if span.Resource == "" {
return
}

quantizedString, err := tokenQuantizer.Process(span.Resource)
quantizedString, err := o.sql.Process(span.Resource)
if err != nil || quantizedString == "" {
// if we have an error, the partially parsed SQL is discarded so that we don't pollute
// users resources. Here we provide more details to debug the problem.
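A side effect of removing the package-level tokenQuantizer is that each Obfuscator now owns its SQL consumer, so separate Obfuscators no longer share filter state such as lastToken. A minimal sketch of the new wiring (not part of the commit; only identifiers shown in the diff above are used):

package obfuscate

// demoPerObfuscatorConsumer: every Obfuscator gets its own consumer from
// newTokenConsumer() inside NewObfuscator, instead of a shared global.
func demoPerObfuscatorConsumer() {
	o1 := NewObfuscator(nil)
	o2 := NewObfuscator(nil)
	// o1.sql and o2.sql are independent *tokenConsumer values, so the two
	// instances do not step on each other's filter state.
	_, _ = o1, o2
}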
34 changes: 15 additions & 19 deletions obfuscate/sql_test.go
@@ -335,6 +335,18 @@ func TestSQLQuantizer(t *testing.T) {
`,
`SELECT t1.userid, t1.fullname, t1.firm_id, t2.firmname, t1.email, t1.location, t1.state, t1.phone, t1.url, DATE_FORMAT ( t1.lastmod, %m/%d/%Y %h:%i:%s ), t1.lastmod, t1.user_status, t1.pw_expire, DATE_FORMAT ( t1.pw_expire, %m/%d/%Y ), t1.addr1, t1.addr2, t1.zipcode, t1.office_id, t1.default_group, t3.firm_status, t1.title FROM userdata LEFT JOIN lawfirm_names ON t1.firm_id = t2.firm_id LEFT JOIN lawfirms ON t1.firm_id = t3.firm_id WHERE t1.userid = ?`,
},
{
`SELECT [b].[BlogId], [b].[Name]
FROM [Blogs] AS [b]
ORDER BY [b].[Name]`,
`SELECT [ b ] . [ BlogId ], [ b ] . [ Name ] FROM [ Blogs ] ORDER BY [ b ] . [ Name ]`,
},
{
`SELECT [b].[BlogId], [b].[Name]
FROM [Blogs] AS [b
ORDER BY [b].[Name]`,
`Non-parsable SQL query`,
},
}

for _, c := range cases {
@@ -361,14 +373,8 @@ func TestMultipleProcess(t *testing.T) {
},
}

filters := []TokenFilter{
&DiscardFilter{},
&ReplaceFilter{},
&GroupingFilter{},
}

// The consumer is the same between executions
consumer := NewTokenConsumer(filters)
consumer := newTokenConsumer()

for _, tc := range testCases {
output, err := consumer.Process(tc.query)
@@ -383,12 +389,7 @@ func TestConsumerError(t *testing.T) {
// Malformed SQL is not accepted and the outer component knows
// what to do with malformed SQL
input := "SELECT * FROM users WHERE users.id = '1 AND users.name = 'dog'"
filters := []TokenFilter{
&DiscardFilter{},
&ReplaceFilter{},
&GroupingFilter{},
}
consumer := NewTokenConsumer(filters)
consumer := newTokenConsumer()

output, err := consumer.Process(input)
assert.NotNil(err)
@@ -404,12 +405,7 @@ func BenchmarkTokenizer(b *testing.B) {
{"Escaping", `INSERT INTO delayed_jobs (attempts, created_at, failed_at, handler, last_error, locked_at, locked_by, priority, queue, run_at, updated_at) VALUES (0, '2016-12-04 17:09:59', NULL, '--- !ruby/object:Delayed::PerformableMethod\nobject: !ruby/object:Item\n store:\n - a simple string\n - an \'escaped \' string\n - another \'escaped\' string\n - 42\n string: a string with many \\\\\'escapes\\\\\'\nmethod_name: :show_store\nargs: []\n', NULL, NULL, NULL, 0, NULL, '2016-12-04 17:09:59', '2016-12-04 17:09:59')`},
{"Grouping", `INSERT INTO delayed_jobs (created_at, failed_at, handler) VALUES (0, '2016-12-04 17:09:59', NULL), (0, '2016-12-04 17:09:59', NULL), (0, '2016-12-04 17:09:59', NULL), (0, '2016-12-04 17:09:59', NULL)`},
}
filters := []TokenFilter{
&DiscardFilter{},
&ReplaceFilter{},
&GroupingFilter{},
}
consumer := NewTokenConsumer(filters)
consumer := newTokenConsumer()

for _, bm := range benchmarks {
b.Run(bm.name+"/"+strconv.Itoa(len(bm.query)), func(b *testing.B) {
5 changes: 5 additions & 0 deletions obfuscate/sql_tokenizer.go
@@ -44,6 +44,11 @@ const (
// FilteredComma specifies that the token is a comma and was discarded by one
// of the filters.
FilteredComma = 57366

// FilteredBracketedIdentifier specifies that we are currently discarding
// a bracketed identifier (MSSQL).
// See issue https://github.com/DataDog/datadog-trace-agent/issues/475.
FilteredBracketedIdentifier = 57367
)

// Tokenizer is the struct used to generate SQL
