From 784fac68a974e804a5cf8d586e199569bdb18457 Mon Sep 17 00:00:00 2001 From: Kostas Papageorgiou Date: Fri, 11 Dec 2020 15:54:33 +0200 Subject: [PATCH] CLI tool that infers Custom Logs schema out of samples (#2194) --- cmd/devtools/customlogs/customlogs/infer.go | 174 ++++++++++++++++++ .../customlogs/customlogs/infer_test.go | 41 +++++ .../customlogs/{main.go => test.go} | 27 +-- .../customlogs/testdata/sample_1.jsonl | 3 + .../customlogs/testdata/schema_1.yml | 49 +++++ cmd/devtools/customlogs/main.go | 80 ++++++++ .../log_processor/logschema/infer.go | 20 +- 7 files changed, 365 insertions(+), 29 deletions(-) create mode 100644 cmd/devtools/customlogs/customlogs/infer.go create mode 100644 cmd/devtools/customlogs/customlogs/infer_test.go rename cmd/devtools/customlogs/customlogs/{main.go => test.go} (85%) create mode 100644 cmd/devtools/customlogs/customlogs/testdata/sample_1.jsonl create mode 100644 cmd/devtools/customlogs/customlogs/testdata/schema_1.yml create mode 100644 cmd/devtools/customlogs/main.go diff --git a/cmd/devtools/customlogs/customlogs/infer.go b/cmd/devtools/customlogs/customlogs/infer.go new file mode 100644 index 0000000000..ff11aadcb1 --- /dev/null +++ b/cmd/devtools/customlogs/customlogs/infer.go @@ -0,0 +1,174 @@ +package customlogs + +/** + * Panther is a Cloud-Native SIEM for the Modern Security Team. + * Copyright (C) 2020 Panther Labs Inc + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +import ( + "bufio" + "bytes" + "flag" + "fmt" + "io" + "os" + "strings" + + jsoniter "github.com/json-iterator/go" + "github.com/pkg/errors" + "go.uber.org/zap" + "gopkg.in/yaml.v2" + + "github.com/panther-labs/panther/internal/log_analysis/log_processor/customlogs" + "github.com/panther-labs/panther/internal/log_analysis/log_processor/logschema" + "github.com/panther-labs/panther/internal/log_analysis/log_processor/logtypes" +) + +type InferOpts struct { + SkipTest *bool +} + +var inferJsoniter = jsoniter.Config{ + UseNumber: true, +}.Froze() + +// Infers a schema given a sample of logs +func Infer(logger *zap.Logger, opts *InferOpts) { + inputFiles := flag.Args() + if len(inputFiles) == 0 { + logger.Fatal("You need to specify at least one file") + flag.Usage() + } + + var valueSchema *logschema.ValueSchema + var err error + for _, file := range inputFiles { + valueSchema, err = inferFromFile(valueSchema, file) + if err != nil { + logger.Fatal("failed to generate schema", zap.Error(err)) + } + } + + // Remove empty objects + valueSchema = valueSchema.NonEmpty() + + if !*opts.SkipTest { + // In order to validate that the schema generated is correct, + // run the parser against the logs, fail in case of error + for _, file := range inputFiles { + if err = validateSchema(valueSchema, file); err != nil { + logger.Fatal("failed while testing schema with file. You can specify '-skip-test' argument to skip this step", zap.Error(err)) + } + } + } + + schema, err := yaml.Marshal(logschema.Schema{Version: 0, Fields: valueSchema.Fields}) + if err != nil { + logger.Fatal("failed to marshal schema", zap.Error(err)) + } + fmt.Println(string(schema)) +} + +func inferFromFile(root *logschema.ValueSchema, file string) (*logschema.ValueSchema, error) { + f, err := os.Open(file) + if err != nil { + return nil, err + } + defer f.Close() // nolint: errcheck + + reader := bufio.NewReader(f) + lineNum := 0 + run := true + for run { + lineNum++ + line, err := reader.ReadBytes('\n') + if err != nil { + if err == io.EOF { + // Don't go through more lines, but make sure to process existing line + run = false + } else { + return root, errors.Wrap(err, "failed while reading file") + } + } + line = bytes.TrimSpace(line) + if len(line) == 0 { + continue + } + + var data map[string]interface{} + if err = inferJsoniter.Unmarshal(line, &data); err != nil { + return nil, errors.Wrapf(err, "failed to parse line [%d] as JSON", lineNum) + } + lineObject := logschema.InferJSONValueSchema(data) + if lineObject.Type != logschema.TypeObject { + return nil, errors.New("invalid schema") + } + root = logschema.Merge(root, lineObject) + } + + return root, nil +} + +// Validates the schema. It generates a parser of the provided schema +// and tries to parse the contents of the file. +func validateSchema(valueSchema *logschema.ValueSchema, file string) error { + desc := logtypes.Desc{ + Name: "Custom.Test", + Description: "Custom log test schema", + ReferenceURL: "-", + } + schema := &logschema.Schema{Version: 0, Fields: valueSchema.Fields} + entry, err := customlogs.Build(desc, schema) + if err != nil { + validationErrors := logschema.ValidationErrors(err) + if len(validationErrors) > 0 { + return errors.New(validationErrors[0].String()) + } + return err + } + parser, err := entry.NewParser(nil) + if err != nil { + return err + } + + fd, err := os.Open(file) + if err != nil { + return err + } + + reader := bufio.NewReader(fd) + run := true + for run { + line, err := reader.ReadString('\n') + if err != nil { + if err == io.EOF { + // Don't go through more lines, but make sure to process existing line + run = false + } else { + return errors.Wrap(err, "failed while reading file") + } + } + line = strings.TrimSpace(line) + if len(line) == 0 { + continue + } + + if _, err = parser.ParseLog(line); err != nil { + return err + } + } + return nil +} diff --git a/cmd/devtools/customlogs/customlogs/infer_test.go b/cmd/devtools/customlogs/customlogs/infer_test.go new file mode 100644 index 0000000000..b38455fda5 --- /dev/null +++ b/cmd/devtools/customlogs/customlogs/infer_test.go @@ -0,0 +1,41 @@ +package customlogs + +/** + * Panther is a Cloud-Native SIEM for the Modern Security Team. + * Copyright (C) 2020 Panther Labs Inc + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +import ( + "fmt" + "io/ioutil" + "testing" + + "github.com/stretchr/testify/assert" + "gopkg.in/yaml.v2" +) + +func TestProcessLine(t *testing.T) { + schema, err := inferFromFile(nil, "./testdata/sample_1.jsonl") + schema = schema.NonEmpty() + assert.NoError(t, err) + fd, err := ioutil.ReadFile("./testdata/schema_1.yml") + assert.NoError(t, err) + + marshalled, err := yaml.Marshal(schema) + assert.NoError(t, err) + fmt.Println(string(marshalled)) + assert.YAMLEq(t, string(fd), string(marshalled)) +} diff --git a/cmd/devtools/customlogs/customlogs/main.go b/cmd/devtools/customlogs/customlogs/test.go similarity index 85% rename from cmd/devtools/customlogs/customlogs/main.go rename to cmd/devtools/customlogs/customlogs/test.go index bc25080212..da990c9d19 100644 --- a/cmd/devtools/customlogs/customlogs/main.go +++ b/cmd/devtools/customlogs/customlogs/test.go @@ -1,4 +1,4 @@ -package main +package customlogs /** * Panther is a Cloud-Native SIEM for the Modern Security Team. @@ -24,14 +24,12 @@ import ( "fmt" "io" "io/ioutil" - "log" "os" "strings" "go.uber.org/zap" "gopkg.in/yaml.v2" - "github.com/panther-labs/panther/cmd/opstools" "github.com/panther-labs/panther/internal/log_analysis/log_processor/customlogs" "github.com/panther-labs/panther/internal/log_analysis/log_processor/logschema" "github.com/panther-labs/panther/internal/log_analysis/log_processor/logtypes" @@ -39,30 +37,17 @@ import ( "github.com/panther-labs/panther/internal/log_analysis/log_processor/parsers" ) -// CLI flags -var opts = struct { +type TestOpts struct { Schema *string Output *string -}{ - Schema: flag.String("s", "", "Schema file"), - Output: flag.String("o", "", "Write parsed results to file (defaults to stdout)"), } -func main() { - opstools.SetUsage(`-s SCHEMA_FILE [-o OUTPUT_FILE] [INPUT_FILES...]`) - flag.Parse() - loggerConfig := zap.NewDevelopmentConfig() - loggerConfig.DisableStacktrace = true - loggerConfig.DisableCaller = true - z, err := loggerConfig.Build() - if err != nil { - log.Fatalln("failed to start logger: ", err.Error()) - } - logger := z.Sugar() +// Test validates a log schema against a sample of logs +func Test(logger *zap.SugaredLogger, opts *TestOpts) { schemaFile := *opts.Schema if schemaFile == "" { flag.Usage() - log.Fatal("no schema file provided") + logger.Fatal("no schema file provided") } schemaData, err := ioutil.ReadFile(schemaFile) if err != nil { @@ -81,7 +66,7 @@ func main() { if err != nil { validationErrors := logschema.ValidationErrors(err) if len(validationErrors) > 0 { - logger.Error("Schema validation failed:") + logger.Error("File validation failed:") for _, e := range validationErrors { logger.Errorf(" - %s", e.String()) } diff --git a/cmd/devtools/customlogs/customlogs/testdata/sample_1.jsonl b/cmd/devtools/customlogs/customlogs/testdata/sample_1.jsonl new file mode 100644 index 0000000000..4ef5b1c801 --- /dev/null +++ b/cmd/devtools/customlogs/customlogs/testdata/sample_1.jsonl @@ -0,0 +1,3 @@ +{"string" : "value", "required_string": "value", "boolean": true, "boolean_as_string": "true", "float": 0.0, "time": "2020-01-01T00:00:00Z", "float_as_string": "0.0", "int_as_string": "0", "not_int_as_string": "0"} +{"required_string": "value", "boolean_as_string": "false", "int": 0, "object": {"string": ""}, "array": [1,2]} +{"required_string": "value", "not_int_as_string": "value", "string" : null, "object": {}, "empty_object": {"empty": {"more_empty": {}}}} diff --git a/cmd/devtools/customlogs/customlogs/testdata/schema_1.yml b/cmd/devtools/customlogs/customlogs/testdata/schema_1.yml new file mode 100644 index 0000000000..3ad4ee4384 --- /dev/null +++ b/cmd/devtools/customlogs/customlogs/testdata/schema_1.yml @@ -0,0 +1,49 @@ +# Panther is a Cloud-Native SIEM for the Modern Security Team. +# Copyright (C) 2020 Panther Labs Inc +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +type: object +fields: + - name: boolean + type: boolean + - name: float + type: float + - name: float_as_string + type: float + - name: int_as_string + type: bigint + - name: string + type: string + - name: time + type: timestamp + timeFormat: rfc3339 + - name: array + type: array + element: + type: bigint + - name: int + type: bigint + - name: boolean_as_string + type: boolean + - name: not_int_as_string + type: string + - name: object + type: object + fields: + - name: string + type: string + - name: required_string + type: string + required: true diff --git a/cmd/devtools/customlogs/main.go b/cmd/devtools/customlogs/main.go new file mode 100644 index 0000000000..dae6c58dec --- /dev/null +++ b/cmd/devtools/customlogs/main.go @@ -0,0 +1,80 @@ +package main + +/** + * Panther is a Cloud-Native SIEM for the Modern Security Team. + * Copyright (C) 2020 Panther Labs Inc + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +import ( + "flag" + "log" + "os" + "strings" + + "go.uber.org/zap" + + "github.com/panther-labs/panther/cmd/devtools/customlogs/customlogs" + "github.com/panther-labs/panther/cmd/opstools" +) + +// CLI commands +const testCmd = "test" +const inferCmd = "infer" + +func main() { + opstools.SetUsage(strings.Join([]string{testCmd, inferCmd}, ",")) + + loggerConfig := zap.NewDevelopmentConfig() + loggerConfig.DisableStacktrace = true + loggerConfig.DisableCaller = true + z, err := loggerConfig.Build() + if err != nil { + log.Fatalln("failed to start logger: ", err.Error()) + } + logger := z.Sugar() + + if len(os.Args) < 2 { + flag.Usage() + logger.Fatalf("Need to provide one command") + } + + switch cmd := os.Args[1]; cmd { + case testCmd: + opstools.SetUsage(`-s SCHEMA_FILE [-o OUTPUT_FILE] [INPUT_FILES...]`) + opts := &customlogs.TestOpts{ + Schema: flag.String("s", "", "Schema file"), + Output: flag.String("o", "", "Write parsed results to file (defaults to stdout)"), + } + if err := flag.CommandLine.Parse(os.Args[2:]); err != nil { + logger.Fatalf("failed to parse command line arguments") + flag.Usage() + } + customlogs.Test(logger, opts) + case inferCmd: + opstools.SetUsage(`[INPUT_FILES...]`) + opts := &customlogs.InferOpts{ + SkipTest: flag.Bool("skip-test", false, "Skips testing the schema against the logs"), + } + if err := flag.CommandLine.Parse(os.Args[2:]); err != nil { + logger.Fatalf("failed to parse command line arguments") + flag.Usage() + } + customlogs.Infer(logger.Desugar(), opts) + default: + logger.Fatalf("Invalid command [%s]", cmd) + flag.Usage() + } +} diff --git a/internal/log_analysis/log_processor/logschema/infer.go b/internal/log_analysis/log_processor/logschema/infer.go index 8c6fe2e4af..c321956c3d 100644 --- a/internal/log_analysis/log_processor/logschema/infer.go +++ b/internal/log_analysis/log_processor/logschema/infer.go @@ -22,6 +22,7 @@ import ( "encoding/json" "net" "net/url" + "sort" "strconv" "time" @@ -51,6 +52,9 @@ func InferJSONValueSchema(x interface{}) *ValueSchema { ValueSchema: *vs, }) } + sort.Slice(fields, func(i, j int) bool { + return fields[i].Name < fields[j].Name + }) return &ValueSchema{ Type: TypeObject, Fields: fields, @@ -86,22 +90,22 @@ func InferJSONValueSchema(x interface{}) *ValueSchema { } func inferString(s string) *ValueSchema { - if _, err := json.Number(s).Int64(); err != nil { + if _, err := json.Number(s).Int64(); err == nil { return &ValueSchema{ Type: TypeBigInt, } } - if _, err := json.Number(s).Float64(); err != nil { + if _, err := json.Number(s).Float64(); err == nil { return &ValueSchema{ Type: TypeFloat, } } - if _, err := strconv.ParseBool(s); err != nil { + if _, err := strconv.ParseBool(s); err == nil { return &ValueSchema{ Type: TypeBoolean, } } - if _, err := time.Parse(time.RFC3339, s); err != nil { + if _, err := time.Parse(time.RFC3339, s); err == nil { return &ValueSchema{ Type: TypeTimestamp, TimeFormat: "rfc3339", @@ -117,7 +121,7 @@ func inferIndicators(s string) []string { if ip := net.ParseIP(s); ip != nil { return []string{"ip"} } - if _, err := url.Parse(s); err == nil { + if u, err := url.Parse(s); err == nil && (u.Scheme == "http" || u.Scheme == "https") { return []string{"url"} } if _, err := arn.Parse(s); err == nil { @@ -133,9 +137,6 @@ func (v *ValueSchema) NonEmpty() *ValueSchema { } switch v.Type { case TypeObject: - if v.Fields == nil { - return nil - } fields := make([]FieldSchema, 0, len(v.Fields)) for _, f := range v.Fields { if v := f.ValueSchema.NonEmpty(); v != nil { @@ -143,6 +144,9 @@ func (v *ValueSchema) NonEmpty() *ValueSchema { fields = append(fields, f) } } + if len(fields) == 0 { + return nil + } return &ValueSchema{ Type: TypeObject, Fields: fields,