forked from gane5hvarma/panther
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
CLI tool that infers Custom Logs schema out of samples (#2194)
- Loading branch information
Kostas Papageorgiou
authored
Dec 11, 2020
1 parent
961e60f
commit 784fac6
Showing
7 changed files
with
365 additions
and
29 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,174 @@ | ||
package customlogs | ||
|
||
/** | ||
* Panther is a Cloud-Native SIEM for the Modern Security Team. | ||
* Copyright (C) 2020 Panther Labs Inc | ||
* | ||
* This program is free software: you can redistribute it and/or modify | ||
* it under the terms of the GNU Affero General Public License as | ||
* published by the Free Software Foundation, either version 3 of the | ||
* License, or (at your option) any later version. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU Affero General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Affero General Public License | ||
* along with this program. If not, see <https://www.gnu.org/licenses/>. | ||
*/ | ||
|
||
import ( | ||
"bufio" | ||
"bytes" | ||
"flag" | ||
"fmt" | ||
"io" | ||
"os" | ||
"strings" | ||
|
||
jsoniter "github.com/json-iterator/go" | ||
"github.com/pkg/errors" | ||
"go.uber.org/zap" | ||
"gopkg.in/yaml.v2" | ||
|
||
"github.com/panther-labs/panther/internal/log_analysis/log_processor/customlogs" | ||
"github.com/panther-labs/panther/internal/log_analysis/log_processor/logschema" | ||
"github.com/panther-labs/panther/internal/log_analysis/log_processor/logtypes" | ||
) | ||
|
||
// InferOpts holds the command-line options accepted by Infer.
type InferOpts struct {
	// SkipTest, when it points to true, makes Infer skip the step that
	// validates the generated schema against the input files.
	SkipTest *bool
}
|
||
var inferJsoniter = jsoniter.Config{ | ||
UseNumber: true, | ||
}.Froze() | ||
|
||
// Infers a schema given a sample of logs | ||
func Infer(logger *zap.Logger, opts *InferOpts) { | ||
inputFiles := flag.Args() | ||
if len(inputFiles) == 0 { | ||
logger.Fatal("You need to specify at least one file") | ||
flag.Usage() | ||
} | ||
|
||
var valueSchema *logschema.ValueSchema | ||
var err error | ||
for _, file := range inputFiles { | ||
valueSchema, err = inferFromFile(valueSchema, file) | ||
if err != nil { | ||
logger.Fatal("failed to generate schema", zap.Error(err)) | ||
} | ||
} | ||
|
||
// Remove empty objects | ||
valueSchema = valueSchema.NonEmpty() | ||
|
||
if !*opts.SkipTest { | ||
// In order to validate that the schema generated is correct, | ||
// run the parser against the logs, fail in case of error | ||
for _, file := range inputFiles { | ||
if err = validateSchema(valueSchema, file); err != nil { | ||
logger.Fatal("failed while testing schema with file. You can specify '-skip-test' argument to skip this step", zap.Error(err)) | ||
} | ||
} | ||
} | ||
|
||
schema, err := yaml.Marshal(logschema.Schema{Version: 0, Fields: valueSchema.Fields}) | ||
if err != nil { | ||
logger.Fatal("failed to marshal schema", zap.Error(err)) | ||
} | ||
fmt.Println(string(schema)) | ||
} | ||
|
||
func inferFromFile(root *logschema.ValueSchema, file string) (*logschema.ValueSchema, error) { | ||
f, err := os.Open(file) | ||
if err != nil { | ||
return nil, err | ||
} | ||
defer f.Close() // nolint: errcheck | ||
|
||
reader := bufio.NewReader(f) | ||
lineNum := 0 | ||
run := true | ||
for run { | ||
lineNum++ | ||
line, err := reader.ReadBytes('\n') | ||
if err != nil { | ||
if err == io.EOF { | ||
// Don't go through more lines, but make sure to process existing line | ||
run = false | ||
} else { | ||
return root, errors.Wrap(err, "failed while reading file") | ||
} | ||
} | ||
line = bytes.TrimSpace(line) | ||
if len(line) == 0 { | ||
continue | ||
} | ||
|
||
var data map[string]interface{} | ||
if err = inferJsoniter.Unmarshal(line, &data); err != nil { | ||
return nil, errors.Wrapf(err, "failed to parse line [%d] as JSON", lineNum) | ||
} | ||
lineObject := logschema.InferJSONValueSchema(data) | ||
if lineObject.Type != logschema.TypeObject { | ||
return nil, errors.New("invalid schema") | ||
} | ||
root = logschema.Merge(root, lineObject) | ||
} | ||
|
||
return root, nil | ||
} | ||
|
||
// Validates the schema. It generates a parser of the provided schema | ||
// and tries to parse the contents of the file. | ||
func validateSchema(valueSchema *logschema.ValueSchema, file string) error { | ||
desc := logtypes.Desc{ | ||
Name: "Custom.Test", | ||
Description: "Custom log test schema", | ||
ReferenceURL: "-", | ||
} | ||
schema := &logschema.Schema{Version: 0, Fields: valueSchema.Fields} | ||
entry, err := customlogs.Build(desc, schema) | ||
if err != nil { | ||
validationErrors := logschema.ValidationErrors(err) | ||
if len(validationErrors) > 0 { | ||
return errors.New(validationErrors[0].String()) | ||
} | ||
return err | ||
} | ||
parser, err := entry.NewParser(nil) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
fd, err := os.Open(file) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
reader := bufio.NewReader(fd) | ||
run := true | ||
for run { | ||
line, err := reader.ReadString('\n') | ||
if err != nil { | ||
if err == io.EOF { | ||
// Don't go through more lines, but make sure to process existing line | ||
run = false | ||
} else { | ||
return errors.Wrap(err, "failed while reading file") | ||
} | ||
} | ||
line = strings.TrimSpace(line) | ||
if len(line) == 0 { | ||
continue | ||
} | ||
|
||
if _, err = parser.ParseLog(line); err != nil { | ||
return err | ||
} | ||
} | ||
return nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
package customlogs | ||
|
||
/** | ||
* Panther is a Cloud-Native SIEM for the Modern Security Team. | ||
* Copyright (C) 2020 Panther Labs Inc | ||
* | ||
* This program is free software: you can redistribute it and/or modify | ||
* it under the terms of the GNU Affero General Public License as | ||
* published by the Free Software Foundation, either version 3 of the | ||
* License, or (at your option) any later version. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU Affero General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Affero General Public License | ||
* along with this program. If not, see <https://www.gnu.org/licenses/>. | ||
*/ | ||
|
||
import ( | ||
"fmt" | ||
"io/ioutil" | ||
"testing" | ||
|
||
"github.com/stretchr/testify/assert" | ||
"gopkg.in/yaml.v2" | ||
) | ||
|
||
func TestProcessLine(t *testing.T) { | ||
schema, err := inferFromFile(nil, "./testdata/sample_1.jsonl") | ||
schema = schema.NonEmpty() | ||
assert.NoError(t, err) | ||
fd, err := ioutil.ReadFile("./testdata/schema_1.yml") | ||
assert.NoError(t, err) | ||
|
||
marshalled, err := yaml.Marshal(schema) | ||
assert.NoError(t, err) | ||
fmt.Println(string(marshalled)) | ||
assert.YAMLEq(t, string(fd), string(marshalled)) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
{"string" : "value", "required_string": "value", "boolean": true, "boolean_as_string": "true", "float": 0.0, "time": "2020-01-01T00:00:00Z", "float_as_string": "0.0", "int_as_string": "0", "not_int_as_string": "0"} | ||
{"required_string": "value", "boolean_as_string": "false", "int": 0, "object": {"string": ""}, "array": [1,2]} | ||
{"required_string": "value", "not_int_as_string": "value", "string" : null, "object": {}, "empty_object": {"empty": {"more_empty": {}}}} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# Panther is a Cloud-Native SIEM for the Modern Security Team. | ||
# Copyright (C) 2020 Panther Labs Inc | ||
# | ||
# This program is free software: you can redistribute it and/or modify | ||
# it under the terms of the GNU Affero General Public License as | ||
# published by the Free Software Foundation, either version 3 of the | ||
# License, or (at your option) any later version. | ||
# | ||
# This program is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU Affero General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU Affero General Public License | ||
# along with this program. If not, see <https://www.gnu.org/licenses/>. | ||
|
||
type: object | ||
fields: | ||
- name: boolean | ||
type: boolean | ||
- name: float | ||
type: float | ||
- name: float_as_string | ||
type: float | ||
- name: int_as_string | ||
type: bigint | ||
- name: string | ||
type: string | ||
- name: time | ||
type: timestamp | ||
timeFormat: rfc3339 | ||
- name: array | ||
type: array | ||
element: | ||
type: bigint | ||
- name: int | ||
type: bigint | ||
- name: boolean_as_string | ||
type: boolean | ||
- name: not_int_as_string | ||
type: string | ||
- name: object | ||
type: object | ||
fields: | ||
- name: string | ||
type: string | ||
- name: required_string | ||
type: string | ||
required: true |
Oops, something went wrong.