This repository has been archived by the owner on Aug 30, 2019. It is now read-only.

obfuscate: account for bracketed identifiers as AS parameters #474

Merged (2 commits) on Sep 26, 2018
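
In short, the obfuscator now treats an MSSQL bracketed identifier that follows AS the same way it treats a plain alias: it is discarded, and an unterminated bracket becomes a parse error. A minimal sketch against the package-internal API introduced below (expected outputs are taken from the new test cases in this PR):

consumer := newTokenConsumer()

out, err := consumer.Process(`SELECT [b].[BlogId], [b].[Name] FROM [Blogs] AS [b] ORDER BY [b].[Name]`)
// err == nil
// out == `SELECT [ b ] . [ BlogId ], [ b ] . [ Name ] FROM [ Blogs ] ORDER BY [ b ] . [ Name ]`

_, err = consumer.Process(`SELECT [b].[BlogId] FROM [Blogs] AS [b ORDER BY [b].[Name]`)
// err != nil: the closing bracket never appears, so the query is rejected
// ("the tokenizer was unable to process the string")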
6 changes: 5 additions & 1 deletion obfuscate/obfuscate.go
@@ -13,6 +13,7 @@ import (
// concurrent use.
type Obfuscator struct {
opts *config.ObfuscationConfig
sql *tokenConsumer
es *jsonObfuscator // nil if disabled
mongo *jsonObfuscator // nil if disabled
}
@@ -22,7 +23,10 @@ func NewObfuscator(cfg *config.ObfuscationConfig) *Obfuscator {
if cfg == nil {
cfg = new(config.ObfuscationConfig)
}
o := Obfuscator{opts: cfg}
o := Obfuscator{
opts: cfg,
sql: newTokenConsumer(),
}
if cfg.ES.Enabled {
o.es = newJSONObfuscator(&cfg.ES)
}
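
For callers nothing changes in the constructor's shape; each Obfuscator now simply carries its own SQL token consumer instead of relying on the shared package-level one removed from sql.go below. A short illustrative sketch:

cfg := new(config.ObfuscationConfig)
cfg.ES.Enabled = true   // optional: also build the ES JSON obfuscator
o := NewObfuscator(cfg) // o.sql is set to newTokenConsumer()
// SQL resources are now quantized through o.sql rather than a package-level consumer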
100 changes: 57 additions & 43 deletions obfuscate/sql.go
@@ -13,26 +13,43 @@ const (
sqlQuantizeError = "agent.parse.error"
)

// TokenFilter is a generic interface that a TokenConsumer expects. It defines
// tokenFilter is a generic interface that a tokenConsumer expects. It defines
// the Filter() function used to filter or replace given tokens.
// A filter can be stateful and keep an internal state to apply the filter later;
// this can be useful to prevent backtracking in some cases.
type TokenFilter interface {
type tokenFilter interface {
Filter(token, lastToken int, buffer []byte) (int, []byte)
Reset()
}

// DiscardFilter implements the TokenFilter interface so that the given
// discardFilter implements the tokenFilter interface so that the given
// token is discarded or accepted.
type DiscardFilter struct{}
type discardFilter struct{}

// Filter the given token so that a `nil` slice is returned if the token
// is in the token filtered list.
func (f *DiscardFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) {
func (f *discardFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) {
// filters based on previous token
switch lastToken {
case FilteredBracketedIdentifier:
if token != ']' {
// we haven't found the closing bracket yet, keep going
if token != ID {
// the token between the brackets *must* be an identifier,
// otherwise the query is invalid.
return LexError, nil
}
return FilteredBracketedIdentifier, nil
}
fallthrough
case As:
// prevent the next comma from being part of a GroupingFilter
if token == '[' {
// the identifier followed by AS is an MSSQL bracketed identifier
// and will continue to be discarded until we find the corresponding
// closing bracket counter-part. See GitHub issue #475.
return FilteredBracketedIdentifier, nil
}
// prevent the next comma from being part of a groupingFilter
return FilteredComma, nil
}

@@ -48,15 +65,15 @@ func (f *DiscardFilter) Filter(token, lastToken int, buffer []byte) (int, []byte
}
}

// Reset in a DiscardFilter is a noop action
func (f *DiscardFilter) Reset() {}
// Reset in a discardFilter is a noop action
func (f *discardFilter) Reset() {}

// ReplaceFilter implements the TokenFilter interface so that the given
// replaceFilter implements the tokenFilter interface so that the given
// token is replaced with '?' or left unchanged.
type ReplaceFilter struct{}
type replaceFilter struct{}

// Filter the given token so that it will be replaced if in the token replacement list
func (f *ReplaceFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) {
func (f *replaceFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) {
switch lastToken {
case Savepoint:
return Filtered, []byte("?")
@@ -69,12 +86,12 @@ func (f *ReplaceFilter) Filter(token, lastToken int, buffer []byte) (int, []byte
}
}

// Reset in a ReplaceFilter is a noop action
func (f *ReplaceFilter) Reset() {}
// Reset in a replaceFilter is a noop action
func (f *replaceFilter) Reset() {}

// GroupingFilter implements the TokenFilter interface so that when
// groupingFilter implements the tokenFilter interface so that when
// a common pattern is identified, it's discarded to prevent duplicates
type GroupingFilter struct {
type groupingFilter struct {
groupFilter int
groupMulti int
}
@@ -83,7 +100,7 @@ type GroupingFilter struct {
// has been recognized. A grouping is composed by items like:
// * '( ?, ?, ? )'
// * '( ?, ? ), ( ?, ? )'
func (f *GroupingFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) {
func (f *groupingFilter) Filter(token, lastToken int, buffer []byte) (int, []byte) {
// increasing the number of groups means that we're filtering an entire group
// because it can be represented with a single '( ? )'
if (lastToken == '(' && token == Filtered) || (token == '(' && f.groupMulti > 0) {
@@ -115,26 +132,26 @@ func (f *GroupingFilter) Filter(token, lastToken int, buffer []byte) (int, []byt
return token, buffer
}

// Reset in a GroupingFilter restores variables used to count
// Reset in a groupingFilter restores variables used to count
// escaped token that should be filtered
func (f *GroupingFilter) Reset() {
func (f *groupingFilter) Reset() {
f.groupFilter = 0
f.groupMulti = 0
}

// TokenConsumer is a Tokenizer consumer. It calls the Tokenizer Scan() function until tokens
// tokenConsumer is a Tokenizer consumer. It calls the Tokenizer Scan() function until tokens
// are available or if a LEX_ERROR is raised. After retrieving a token, it is sent in the
// TokenFilter chains so that the token is discarded or replaced.
type TokenConsumer struct {
// tokenFilter chains so that the token is discarded or replaced.
type tokenConsumer struct {
tokenizer *Tokenizer
filters []TokenFilter
filters []tokenFilter
lastToken int
}

// Process the given SQL or No-SQL string so that the resulting one is properly altered. This
// function is generic and the behavior changes according to chosen TokenFilter implementations.
// The process calls all filters inside the []TokenFilter.
func (t *TokenConsumer) Process(in string) (string, error) {
// function is generic and the behavior changes according to chosen tokenFilter implementations.
// The process calls all filters inside the []tokenFilter.
func (t *tokenConsumer) Process(in string) (string, error) {
out := &bytes.Buffer{}
t.tokenizer.InStream.Reset(in)

@@ -150,7 +167,10 @@ func (t *TokenConsumer) Process(in string) (string, error) {

// apply all registered filters
for _, f := range t.filters {
token, buff = f.Filter(token, t.lastToken, buff)
if token, buff = f.Filter(token, t.lastToken, buff); token == LexError {
t.Reset()
return "", errors.New("the tokenizer was unable to process the string")
}
}

// write the resulting buffer
@@ -182,38 +202,32 @@ func (t *TokenConsumer) Process(in string) (string, error) {
}

// Reset restores the initial states for all components so that memory can be re-used
func (t *TokenConsumer) Reset() {
func (t *tokenConsumer) Reset() {
t.tokenizer.Reset()
for _, f := range t.filters {
f.Reset()
}
}

// NewTokenConsumer returns a new TokenConsumer capable to process SQL and No-SQL strings.
func NewTokenConsumer(filters []TokenFilter) *TokenConsumer {
return &TokenConsumer{
// newTokenConsumer returns a new tokenConsumer capable to process SQL and No-SQL strings.
func newTokenConsumer() *tokenConsumer {
return &tokenConsumer{
tokenizer: NewStringTokenizer(""),
filters: filters,
filters: []tokenFilter{
&discardFilter{},
&replaceFilter{},
&groupingFilter{},
},
}
}

// token consumer that will quantize the query with
// the given filters; this quantizer is used only
// for SQL and CQL strings
var tokenQuantizer = NewTokenConsumer(
[]TokenFilter{
&DiscardFilter{},
&ReplaceFilter{},
&GroupingFilter{},
})

// QuantizeSQL generates resource and sql.query meta for SQL spans
func (*Obfuscator) obfuscateSQL(span *model.Span) {
func (o *Obfuscator) obfuscateSQL(span *model.Span) {
if span.Resource == "" {
return
}

quantizedString, err := tokenQuantizer.Process(span.Resource)
quantizedString, err := o.sql.Process(span.Resource)
if err != nil || quantizedString == "" {
// if we have an error, the partially parsed SQL is discarded so that we don't pollute
// users resources. Here we provide more details to debug the problem.
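
For readers skimming the diff, here is an illustrative trace (not part of the change itself) of what the new discardFilter branch does with the token stream of a query like `SELECT ... FROM [Blogs] AS [b] ORDER BY ...`:

// lastToken                     token      Filter returns
// As                            '['        FilteredBracketedIdentifier  -> start discarding the bracketed alias
// FilteredBracketedIdentifier   ID ("b")   FilteredBracketedIdentifier  -> still between the brackets
// FilteredBracketedIdentifier   ']'        FilteredComma                -> falls through to the As case; bracket discarded
//
// Any token other than ID or ']' while in the FilteredBracketedIdentifier state
// yields LexError, which makes tokenConsumer.Process reset its filters and
// return an error instead of a partially obfuscated query.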
34 changes: 15 additions & 19 deletions obfuscate/sql_test.go
@@ -335,6 +335,18 @@ func TestSQLQuantizer(t *testing.T) {
`,
`SELECT t1.userid, t1.fullname, t1.firm_id, t2.firmname, t1.email, t1.location, t1.state, t1.phone, t1.url, DATE_FORMAT ( t1.lastmod, %m/%d/%Y %h:%i:%s ), t1.lastmod, t1.user_status, t1.pw_expire, DATE_FORMAT ( t1.pw_expire, %m/%d/%Y ), t1.addr1, t1.addr2, t1.zipcode, t1.office_id, t1.default_group, t3.firm_status, t1.title FROM userdata LEFT JOIN lawfirm_names ON t1.firm_id = t2.firm_id LEFT JOIN lawfirms ON t1.firm_id = t3.firm_id WHERE t1.userid = ?`,
},
{
`SELECT [b].[BlogId], [b].[Name]
FROM [Blogs] AS [b]
ORDER BY [b].[Name]`,
`SELECT [ b ] . [ BlogId ], [ b ] . [ Name ] FROM [ Blogs ] ORDER BY [ b ] . [ Name ]`,
},
{
`SELECT [b].[BlogId], [b].[Name]
FROM [Blogs] AS [b
ORDER BY [b].[Name]`,
`Non-parsable SQL query`,
},
}

for _, c := range cases {
@@ -361,14 +373,8 @@ func TestMultipleProcess(t *testing.T) {
},
}

filters := []TokenFilter{
&DiscardFilter{},
&ReplaceFilter{},
&GroupingFilter{},
}

// The consumer is the same between executions
consumer := NewTokenConsumer(filters)
consumer := newTokenConsumer()

for _, tc := range testCases {
output, err := consumer.Process(tc.query)
@@ -383,12 +389,7 @@ func TestConsumerError(t *testing.T) {
// Malformed SQL is not accepted and the outer component knows
// what to do with malformed SQL
input := "SELECT * FROM users WHERE users.id = '1 AND users.name = 'dog'"
filters := []TokenFilter{
&DiscardFilter{},
&ReplaceFilter{},
&GroupingFilter{},
}
consumer := NewTokenConsumer(filters)
consumer := newTokenConsumer()

output, err := consumer.Process(input)
assert.NotNil(err)
@@ -404,12 +405,7 @@ func BenchmarkTokenizer(b *testing.B) {
{"Escaping", `INSERT INTO delayed_jobs (attempts, created_at, failed_at, handler, last_error, locked_at, locked_by, priority, queue, run_at, updated_at) VALUES (0, '2016-12-04 17:09:59', NULL, '--- !ruby/object:Delayed::PerformableMethod\nobject: !ruby/object:Item\n store:\n - a simple string\n - an \'escaped \' string\n - another \'escaped\' string\n - 42\n string: a string with many \\\\\'escapes\\\\\'\nmethod_name: :show_store\nargs: []\n', NULL, NULL, NULL, 0, NULL, '2016-12-04 17:09:59', '2016-12-04 17:09:59')`},
{"Grouping", `INSERT INTO delayed_jobs (created_at, failed_at, handler) VALUES (0, '2016-12-04 17:09:59', NULL), (0, '2016-12-04 17:09:59', NULL), (0, '2016-12-04 17:09:59', NULL), (0, '2016-12-04 17:09:59', NULL)`},
}
filters := []TokenFilter{
&DiscardFilter{},
&ReplaceFilter{},
&GroupingFilter{},
}
consumer := NewTokenConsumer(filters)
consumer := newTokenConsumer()

for _, bm := range benchmarks {
b.Run(bm.name+"/"+strconv.Itoa(len(bm.query)), func(b *testing.B) {
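
Assuming a standard checkout of the trace agent, the new cases and the benchmark can be exercised with the usual Go tooling, for example:

go test ./obfuscate -run 'TestSQLQuantizer|TestMultipleProcess|TestConsumerError'
go test ./obfuscate -bench BenchmarkTokenizer -run '^$'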
5 changes: 5 additions & 0 deletions obfuscate/sql_tokenizer.go
@@ -44,6 +44,11 @@ const (
// FilteredComma specifies that the token is a comma and was discarded by one
// of the filters.
FilteredComma = 57366

// FilteredBracketedIdentifier specifies that we are currently discarding
// a bracketed identifier (MSSQL).
// See issue https://github.com/DataDog/datadog-trace-agent/issues/475.
FilteredBracketedIdentifier = 57367
)

// Tokenizer is the struct used to generate SQL