Skip to content

Commit

Permalink
[Processors] Mime-Type Detection (elastic#22940)
Browse files Browse the repository at this point in the history
* Add mimetype processor

* Add mimetype detection for packetbeat

* Update changelog

* Rev go.sum

* Refactor for reusability and rename to detect_mime_type

* reformat imports

* update docs

* Update maxHeaderSize name and add comment on the fallback behavior
  • Loading branch information
Andrew Stucki authored Dec 8, 2020
1 parent 7c64f53 commit 5f52979
Show file tree
Hide file tree
Showing 16 changed files with 392 additions and 6 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.next.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,7 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d
- Added support for wildcard fields and keyword fallback in beats setup commands. {pull}22521[22521]
- Fix polling node when it is not ready and monitor by hostname {pull}22666[22666]
- Improve equals check. {pull}22778[22778]
- Added "detect_mime_type" processor for detecting mime types {pull}22940[22940]

*Auditbeat*

Expand Down Expand Up @@ -870,6 +871,7 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d
- Add support for overriding the published index on a per-protocol/flow basis. {pull}22134[22134]
- Change build process for x-pack distribution {pull}21979[21979]
- Tuned the internal queue size to reduce the chances of events being dropped. {pull}22650[22650]
- Add support for "http.request.mime_type" and "http.response.mime_type". {pull}22940[22940]

*Functionbeat*

Expand Down
4 changes: 2 additions & 2 deletions NOTICE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10142,11 +10142,11 @@ Contents of probable licence file $GOMODCACHE/github.com/gorhill/[email protected]

--------------------------------------------------------------------------------
Dependency : github.com/h2non/filetype
Version: v1.0.12
Version: v1.1.1-0.20201130172452-f60988ab73d5
Licence type (autodetected): MIT
--------------------------------------------------------------------------------

Contents of probable licence file $GOMODCACHE/github.com/h2non/filetype@v1.0.12/LICENSE:
Contents of probable licence file $GOMODCACHE/github.com/h2non/filetype@v1.1.1-0.20201130172452-f60988ab73d5/LICENSE:

The MIT License

Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ require (
github.com/gorilla/mux v1.7.2 // indirect
github.com/gorilla/websocket v1.4.1 // indirect
github.com/grpc-ecosystem/grpc-gateway v1.13.0 // indirect
github.com/h2non/filetype v1.0.12
github.com/h2non/filetype v1.1.1-0.20201130172452-f60988ab73d5
github.com/hashicorp/go-multierror v1.1.0
github.com/hashicorp/go-retryablehttp v0.6.6
github.com/hashicorp/golang-lru v0.5.2-0.20190520140433-59383c442f7d // indirect
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -416,8 +416,8 @@ github.com/gorilla/websocket v1.4.1/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/ad
github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA=
github.com/grpc-ecosystem/grpc-gateway v1.13.0 h1:sBDQoHXrOlfPobnKw69FIKa1wg9qsLLvvQ/Y19WtFgI=
github.com/grpc-ecosystem/grpc-gateway v1.13.0/go.mod h1:8XEsbTttt/W+VvjtQhLACqCisSPWTxCZ7sBRjU6iH9c=
github.com/h2non/filetype v1.0.12 h1:yHCsIe0y2cvbDARtJhGBTD2ecvqMSTvlIcph9En/Zao=
github.com/h2non/filetype v1.0.12/go.mod h1:319b3zT68BvV+WRj7cwy856M2ehB3HqNOt6sy1HndBY=
github.com/h2non/filetype v1.1.1-0.20201130172452-f60988ab73d5 h1:xI88renBpIJws9OfEQq4Dng10OppnY5u9bTok/GDFEI=
github.com/h2non/filetype v1.1.1-0.20201130172452-f60988ab73d5/go.mod h1:319b3zT68BvV+WRj7cwy856M2ehB3HqNOt6sy1HndBY=
github.com/hashicorp/errwrap v0.0.0-20141028054710-7554cd9344ce/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA=
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
Expand Down
6 changes: 6 additions & 0 deletions libbeat/docs/processors-list.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ endif::[]
ifndef::no_decompress_gzip_field_processor[]
* <<decompress-gzip-field,`decompress_gzip_field`>>
endif::[]
ifndef::no_detect_mime_type_processor[]
* <<detect-mime-type,`detect_mime_type`>>
endif::[]
ifndef::no_dissect_processor[]
* <<dissect, `dissect`>>
endif::[]
Expand Down Expand Up @@ -168,6 +171,9 @@ endif::[]
ifndef::no_decompress_gzip_field_processor[]
include::{libbeat-processors-dir}/actions/docs/decompress_gzip_field.asciidoc[]
endif::[]
ifndef::no_detect_mime_type_processor[]
include::{libbeat-processors-dir}/actions/docs/detect_mime_type.asciidoc[]
endif::[]
ifndef::no_dissect_processor[]
include::{libbeat-processors-dir}/dissect/docs/dissect.asciidoc[]
endif::[]
Expand Down
76 changes: 76 additions & 0 deletions libbeat/mime/byte.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Licensed to Elasticsearch B.V. under one or more contributor
// license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright
// ownership. Elasticsearch B.V. licenses this file to you under
// the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package mime

import (
"encoding/json"
"encoding/xml"
"net/http"
"strings"

"github.com/h2non/filetype"
)

// maxHeaderSize is the number of leading bytes handed to the mime
// detectors. Office file detection requires ~8kb to detect properly.
const maxHeaderSize = 8 << 10

// DetectBytes attempts to determine the mime-type of the given chunk of
// bytes. It returns the empty string when no type could be determined.
func DetectBytes(data []byte) string {
	// Signature matching only looks at the first maxHeaderSize bytes.
	header := data
	if len(header) > maxHeaderSize {
		header = header[:maxHeaderSize]
	}

	// First choice: a known file signature.
	if kind, err := filetype.Match(header); err == nil && kind != filetype.Unknown {
		return kind.MIME.Value
	}

	// Second choice: the content sniffing shipped with net/http.
	sniffed := http.DetectContentType(header)
	switch {
	case strings.HasPrefix(sniffed, "text/plain"):
		// Plain text may really be JSON or XML; the full payload (not
		// just the header) is handed to the parser-based check.
		if refined := detectEncodedText(data); refined != "" {
			return refined
		}
	case sniffed == "application/octet-stream":
		// "application/octet-stream" is the fallback value of
		// http.DetectContentType: seeing it means the type could not be
		// determined, so report "unknown" instead of the fallback.
		return ""
	}
	return sniffed
}

func detectEncodedText(data []byte) string {
// figure out how to optimize this so we don't have to try and parse the whole payload
// every time
if json.Valid(data) {
return "application/json"
}
if xml.Unmarshal(data, new(interface{})) == nil {
return "text/xml"
}
return ""
}
88 changes: 88 additions & 0 deletions libbeat/mime/mime_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
// Licensed to Elasticsearch B.V. under one or more contributor
// license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright
// ownership. Elasticsearch B.V. licenses this file to you under
// the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package mime

import (
"encoding/hex"
"testing"

"github.com/stretchr/testify/require"
)

// TestMimeType feeds a set of sample payloads through Detect and checks
// the reported mime-type for each of them.
func TestMimeType(t *testing.T) {
	type mimeCase struct {
		name         string
		expectedType string
		body         string
	}

	cases := []mimeCase{
		{
			name:         "html",
			expectedType: "text/html; charset=utf-8",
			body:         "<html>Test</html>",
		},
		{
			name:         "pe",
			expectedType: "application/vnd.microsoft.portable-executable",
			body:         convertToData(t, "4d5a90000300000004000000ffff"),
		},
		{
			name:         "elf",
			expectedType: "application/x-executable",
			body:         convertToData(t, "7f454c460101010000000000000000000300030001000000f0dc01003400000080a318000000000034002000080028001e001d0001"),
		},
		{
			name:         "macho",
			expectedType: "application/x-mach-binary",
			body:         convertToData(t, "cffaedfe0700000103000000020000001000000058050000850020000000000019000000480000005f5f504147455a45524f"),
		},
		{
			name:         "json",
			expectedType: "application/json",
			body:         "{}",
		},
		{
			name:         "xml",
			expectedType: "text/xml",
			body:         "<test></test>",
		},
		{
			name:         "text",
			expectedType: "text/plain; charset=utf-8",
			body:         "Hello world!",
		},
		{
			name:         "png",
			expectedType: "image/png",
			body:         convertToData(t, "89504e470d0a1a0a0000000d494844520000025800000258080200000031040f8b0000000467414d410000b18f0bfc610500"),
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			detected := Detect(tc.body)
			require.Equal(t, tc.expectedType, detected)
		})
	}
}

// convertToData decodes a hex-encoded sample into its raw bytes and
// returns them as a string. The test is failed immediately if the sample
// is not valid hex.
func convertToData(t *testing.T, sample string) string {
	t.Helper()

	raw, decodeErr := hex.DecodeString(sample)
	if decodeErr != nil {
		t.Fatal(decodeErr)
	}

	return string(raw)
}
24 changes: 24 additions & 0 deletions libbeat/mime/string.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Licensed to Elasticsearch B.V. under one or more contributor
// license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright
// ownership. Elasticsearch B.V. licenses this file to you under
// the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package mime

// Detect attempts to determine the mime-type of a byte string. It is a
// string-typed convenience wrapper that converts its argument to a byte
// slice and hands it to DetectBytes.
func Detect(data string) string {
	raw := []byte(data)
	return DetectBytes(raw)
}
75 changes: 75 additions & 0 deletions libbeat/processors/actions/detect_mime_type.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
// Licensed to Elasticsearch B.V. under one or more contributor
// license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright
// ownership. Elasticsearch B.V. licenses this file to you under
// the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package actions

import (
"fmt"

"github.com/pkg/errors"

"github.com/elastic/beats/v7/libbeat/beat"
"github.com/elastic/beats/v7/libbeat/common"
"github.com/elastic/beats/v7/libbeat/mime"
"github.com/elastic/beats/v7/libbeat/processors"
"github.com/elastic/beats/v7/libbeat/processors/checks"
)

func init() {
processors.RegisterPlugin("detect_mime_type",
checks.ConfigChecked(NewDetectMimeType,
checks.RequireFields("field", "target"),
checks.AllowedFields("field", "target")))
}

// mimeTypeProcessor holds the configuration of the detect_mime_type
// processor. It is populated from the processor config in
// NewDetectMimeType.
type mimeTypeProcessor struct {
// Field is the event key whose string value is analyzed by Run.
Field string `config:"field"`
// Target is the event key under which Run stores the detected mime type.
Target string `config:"target"`
}

// NewDetectMimeType builds a detect_mime_type processor from the given
// configuration. An error is returned if the configuration cannot be
// unpacked.
func NewDetectMimeType(cfg *common.Config) (processors.Processor, error) {
	var p mimeTypeProcessor

	err := cfg.Unpack(&p)
	if err != nil {
		return nil, errors.Wrapf(err, "fail to unpack the detect_mime_type configuration")
	}

	return &p, nil
}

// Run looks up the configured source field on the event, detects the
// mime type of its string value and stores the result under the
// configured target key.
//
// The event is returned unchanged when the source field is missing, is
// not a string, is empty, or when no mime type could be detected. An
// error is returned (together with the event) only when the detected
// type cannot be written to the target key.
func (m *mimeTypeProcessor) Run(event *beat.Event) (*beat.Event, error) {
	valI, err := event.GetValue(m.Field)
	if err != nil {
		// doesn't have the required field value to analyze
		return event, nil
	}
	val, _ := valI.(string)
	if val == "" {
		// wrong type or not set
		return event, nil
	}

	mimeType := mime.Detect(val)
	if mimeType == "" {
		// nothing detected, leave the event untouched
		return event, nil
	}

	// Write through the event-level accessor, mirroring the GetValue call
	// used for reading. Updating event.Fields with a MapStr literal keyed
	// by m.Target would insert a dotted target such as "a.b.c" as one
	// literal top-level key instead of the nested path a -> b -> c.
	if _, err := event.PutValue(m.Target, mimeType); err != nil {
		return event, errors.Wrapf(err, "failed to set detected mime type in field %q", m.Target)
	}
	return event, nil
}

// String returns a short description of the processor in the form
// "detect_mime_type=<field>-><target>".
func (m *mimeTypeProcessor) String() string {
	from, to := m.Field, m.Target
	return fmt.Sprintf("detect_mime_type=%+v->%+v", from, to)
}
62 changes: 62 additions & 0 deletions libbeat/processors/actions/detect_mime_type_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Licensed to Elasticsearch B.V. under one or more contributor
// license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright
// ownership. Elasticsearch B.V. licenses this file to you under
// the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package actions

import (
"testing"

"github.com/stretchr/testify/require"

"github.com/elastic/beats/v7/libbeat/beat"
"github.com/elastic/beats/v7/libbeat/common"
)

func TestMimeTypeFromTo(t *testing.T) {
evt := beat.Event{
Fields: common.MapStr{
"foo.bar.baz": "hello world!",
},
}
p, err := NewDetectMimeType(common.MustNewConfigFrom(map[string]interface{}{
"field": "foo.bar.baz",
"target": "bar.baz.zoiks",
}))
require.NoError(t, err)
observed, err := p.Run(&evt)
require.NoError(t, err)
enriched, err := observed.Fields.GetValue("bar.baz.zoiks")
require.NoError(t, err)
require.Equal(t, "text/plain; charset=utf-8", enriched)
}

func TestMimeTypeTestNoMatch(t *testing.T) {
evt := beat.Event{
Fields: common.MapStr{
"foo.bar.baz": string([]byte{0, 0}),
},
}
p, err := NewDetectMimeType(common.MustNewConfigFrom(map[string]interface{}{
"field": "foo.bar.baz",
"target": "bar.baz.zoiks",
}))
require.NoError(t, err)
observed, err := p.Run(&evt)
require.NoError(t, err)
hasKey, _ := observed.Fields.HasKey("bar.baz.zoiks")
require.False(t, hasKey)
}
Loading

0 comments on commit 5f52979

Please sign in to comment.