From 5f5297922f91859660e6e46266cd3896dfb06c86 Mon Sep 17 00:00:00 2001 From: Andrew Stucki Date: Mon, 7 Dec 2020 20:02:13 -0500 Subject: [PATCH] [Processors] Mime-Type Detection (#22940) * Add mimetype processor * Add mimetype detection for packetbeat * Update changelog * Rev go.sum * Refactor for reusability and rename to detect_mime_type * reformat imports * update docs * Update maxHeaderSize name and add comment on the fallback behavior --- CHANGELOG.next.asciidoc | 2 + NOTICE.txt | 4 +- go.mod | 2 +- go.sum | 4 +- libbeat/docs/processors-list.asciidoc | 6 ++ libbeat/mime/byte.go | 76 ++++++++++++++++ libbeat/mime/mime_test.go | 88 +++++++++++++++++++ libbeat/mime/string.go | 24 +++++ .../processors/actions/detect_mime_type.go | 75 ++++++++++++++++ .../actions/detect_mime_type_test.go | 62 +++++++++++++ .../actions/docs/detect_mime_type.asciidoc | 23 +++++ packetbeat/_meta/config/processors.yml.tmpl | 6 ++ packetbeat/packetbeat.yml | 6 ++ .../tests/system/config/packetbeat.yml.j2 | 11 ++- .../tests/system/test_0063_http_body.py | 3 + x-pack/packetbeat/packetbeat.yml | 6 ++ 16 files changed, 392 insertions(+), 6 deletions(-) create mode 100644 libbeat/mime/byte.go create mode 100644 libbeat/mime/mime_test.go create mode 100644 libbeat/mime/string.go create mode 100644 libbeat/processors/actions/detect_mime_type.go create mode 100644 libbeat/processors/actions/detect_mime_type_test.go create mode 100644 libbeat/processors/actions/docs/detect_mime_type.asciidoc diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index 0b8d376c563..819fd57d2c1 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -540,6 +540,7 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d - Added support for wildcard fields and keyword fallback in beats setup commands. {pull}22521[22521] - Fix polling node when it is not ready and monitor by hostname {pull}22666[22666] - Improve equals check. 
{pull}22778[22778] +- Added "detect_mime_type" processor for detecting mime types {pull}22940[22940] *Auditbeat* @@ -870,6 +871,7 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d - Add support for overriding the published index on a per-protocol/flow basis. {pull}22134[22134] - Change build process for x-pack distribution {pull}21979[21979] - Tuned the internal queue size to reduce the chances of events being dropped. {pull}22650[22650] +- Add support for "http.request.mime_type" and "http.response.mime_type". {pull}22940[22940] *Functionbeat* diff --git a/NOTICE.txt b/NOTICE.txt index 04001e312bb..0106cf4bc51 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -10142,11 +10142,11 @@ Contents of probable licence file $GOMODCACHE/github.com/gorhill/cronexpr@v0.0.0 -------------------------------------------------------------------------------- Dependency : github.com/h2non/filetype -Version: v1.0.12 +Version: v1.1.1-0.20201130172452-f60988ab73d5 Licence type (autodetected): MIT -------------------------------------------------------------------------------- -Contents of probable licence file $GOMODCACHE/github.com/h2non/filetype@v1.0.12/LICENSE: +Contents of probable licence file $GOMODCACHE/github.com/h2non/filetype@v1.1.1-0.20201130172452-f60988ab73d5/LICENSE: The MIT License diff --git a/go.mod b/go.mod index 83c39ae0a20..f29e3b16382 100644 --- a/go.mod +++ b/go.mod @@ -97,7 +97,7 @@ require ( github.com/gorilla/mux v1.7.2 // indirect github.com/gorilla/websocket v1.4.1 // indirect github.com/grpc-ecosystem/grpc-gateway v1.13.0 // indirect - github.com/h2non/filetype v1.0.12 + github.com/h2non/filetype v1.1.1-0.20201130172452-f60988ab73d5 github.com/hashicorp/go-multierror v1.1.0 github.com/hashicorp/go-retryablehttp v0.6.6 github.com/hashicorp/golang-lru v0.5.2-0.20190520140433-59383c442f7d // indirect diff --git a/go.sum b/go.sum index fb1b0b64c0d..f2fac60f077 100644 --- a/go.sum +++ b/go.sum @@ -416,8 +416,8 @@ 
github.com/gorilla/websocket v1.4.1/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/ad github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= github.com/grpc-ecosystem/grpc-gateway v1.13.0 h1:sBDQoHXrOlfPobnKw69FIKa1wg9qsLLvvQ/Y19WtFgI= github.com/grpc-ecosystem/grpc-gateway v1.13.0/go.mod h1:8XEsbTttt/W+VvjtQhLACqCisSPWTxCZ7sBRjU6iH9c= -github.com/h2non/filetype v1.0.12 h1:yHCsIe0y2cvbDARtJhGBTD2ecvqMSTvlIcph9En/Zao= -github.com/h2non/filetype v1.0.12/go.mod h1:319b3zT68BvV+WRj7cwy856M2ehB3HqNOt6sy1HndBY= +github.com/h2non/filetype v1.1.1-0.20201130172452-f60988ab73d5 h1:xI88renBpIJws9OfEQq4Dng10OppnY5u9bTok/GDFEI= +github.com/h2non/filetype v1.1.1-0.20201130172452-f60988ab73d5/go.mod h1:319b3zT68BvV+WRj7cwy856M2ehB3HqNOt6sy1HndBY= github.com/hashicorp/errwrap v0.0.0-20141028054710-7554cd9344ce/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= diff --git a/libbeat/docs/processors-list.asciidoc b/libbeat/docs/processors-list.asciidoc index 5dd95e2e3d5..89e78ca24ad 100644 --- a/libbeat/docs/processors-list.asciidoc +++ b/libbeat/docs/processors-list.asciidoc @@ -62,6 +62,9 @@ endif::[] ifndef::no_decompress_gzip_field_processor[] * <> endif::[] +ifndef::no_detect_mime_type_processor[] +* <> +endif::[] ifndef::no_dissect_processor[] * <> endif::[] @@ -168,6 +171,9 @@ endif::[] ifndef::no_decompress_gzip_field_processor[] include::{libbeat-processors-dir}/actions/docs/decompress_gzip_field.asciidoc[] endif::[] +ifndef::no_detect_mime_type_processor[] +include::{libbeat-processors-dir}/actions/docs/detect_mime_type.asciidoc[] +endif::[] ifndef::no_dissect_processor[] include::{libbeat-processors-dir}/dissect/docs/dissect.asciidoc[] endif::[] diff --git a/libbeat/mime/byte.go b/libbeat/mime/byte.go new file mode 100644 index 
00000000000..c8be7def361 --- /dev/null +++ b/libbeat/mime/byte.go @@ -0,0 +1,76 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package mime + +import ( + "encoding/json" + "encoding/xml" + "net/http" + "strings" + + "github.com/h2non/filetype" +) + +const ( + // size for mime detection, office file + // detection requires ~8kb to detect properly + maxHeaderSize = 8192 +) + +// DetectBytes tries to detect a mime-type based off +// of a chunk of bytes passed into the function +func DetectBytes(data []byte) string { + header := data + if len(data) > maxHeaderSize { + header = data[:maxHeaderSize] + } + kind, err := filetype.Match(header) + if err == nil && kind != filetype.Unknown { + // we have a known filetype, return + return kind.MIME.Value + } + // if the above fails, try and sniff with http sniffing + netType := http.DetectContentType(header) + // try and parse any sort of text as json or xml + if strings.HasPrefix(netType, "text/plain") { + if detected := detectEncodedText(data); detected != "" { + return detected + } + } + // The fallback for http.DetectContentType is "application/octet-stream" + // meaning that if we see it, we were unable to determine the type and + // we just know we're dealing with a chunk 
of some sort of bytes. Rather + // than reporting the fallback, we'll just say we were unable to detect + // the type. + if netType == "application/octet-stream" { + return "" + } + return netType +} + +func detectEncodedText(data []byte) string { + // figure out how to optimize this so we don't have to try and parse the whole payload + // every time + if json.Valid(data) { + return "application/json" + } + if xml.Unmarshal(data, new(interface{})) == nil { + return "text/xml" + } + return "" +} diff --git a/libbeat/mime/mime_test.go b/libbeat/mime/mime_test.go new file mode 100644 index 00000000000..e4742fb9cfc --- /dev/null +++ b/libbeat/mime/mime_test.go @@ -0,0 +1,88 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package mime + +import ( + "encoding/hex" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestMimeType(t *testing.T) { + tests := []struct { + name string + expectedType string + body string + }{ + { + name: "html", + expectedType: "text/html; charset=utf-8", + body: "Test", + }, + { + name: "pe", + expectedType: "application/vnd.microsoft.portable-executable", + body: convertToData(t, "4d5a90000300000004000000ffff"), + }, + { + name: "elf", + expectedType: "application/x-executable", + body: convertToData(t, "7f454c460101010000000000000000000300030001000000f0dc01003400000080a318000000000034002000080028001e001d0001"), + }, + { + name: "macho", + expectedType: "application/x-mach-binary", + body: convertToData(t, "cffaedfe0700000103000000020000001000000058050000850020000000000019000000480000005f5f504147455a45524f"), + }, + { + name: "json", + expectedType: "application/json", + body: "{}", + }, + { + name: "xml", + expectedType: "text/xml", + body: "", + }, + { + name: "text", + expectedType: "text/plain; charset=utf-8", + body: "Hello world!", + }, + { + name: "png", + expectedType: "image/png", + body: convertToData(t, "89504e470d0a1a0a0000000d494844520000025800000258080200000031040f8b0000000467414d410000b18f0bfc610500"), + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + require.Equal(t, test.expectedType, Detect(test.body)) + }) + } +} + +func convertToData(t *testing.T, sample string) string { + t.Helper() + decoded, err := hex.DecodeString(sample) + if err != nil { + t.Fatal(err) + } + return string(decoded) +} diff --git a/libbeat/mime/string.go b/libbeat/mime/string.go new file mode 100644 index 00000000000..40f231e8abc --- /dev/null +++ b/libbeat/mime/string.go @@ -0,0 +1,24 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. 
licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package mime + +// Detect tries to detect a mime-type based off +// of a byte string passed into the function +func Detect(data string) string { + return DetectBytes([]byte(data)) +} diff --git a/libbeat/processors/actions/detect_mime_type.go b/libbeat/processors/actions/detect_mime_type.go new file mode 100644 index 00000000000..f53794bc1ff --- /dev/null +++ b/libbeat/processors/actions/detect_mime_type.go @@ -0,0 +1,75 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package actions + +import ( + "fmt" + + "github.com/pkg/errors" + + "github.com/elastic/beats/v7/libbeat/beat" + "github.com/elastic/beats/v7/libbeat/common" + "github.com/elastic/beats/v7/libbeat/mime" + "github.com/elastic/beats/v7/libbeat/processors" + "github.com/elastic/beats/v7/libbeat/processors/checks" +) + +func init() { + processors.RegisterPlugin("detect_mime_type", + checks.ConfigChecked(NewDetectMimeType, + checks.RequireFields("field", "target"), + checks.AllowedFields("field", "target"))) +} + +type mimeTypeProcessor struct { + Field string `config:"field"` + Target string `config:"target"` +} + +// NewDetectMimeType constructs a new mime processor. +func NewDetectMimeType(cfg *common.Config) (processors.Processor, error) { + mimeType := &mimeTypeProcessor{} + if err := cfg.Unpack(mimeType); err != nil { + return nil, errors.Wrapf(err, "fail to unpack the detect_mime_type configuration") + } + + return mimeType, nil +} + +func (m *mimeTypeProcessor) Run(event *beat.Event) (*beat.Event, error) { + valI, err := event.GetValue(m.Field) + if err != nil { + // doesn't have the required field value to analyze + return event, nil + } + val, _ := valI.(string) + if val == "" { + // wrong type or not set + return event, nil + } + if mimeType := mime.Detect(val); mimeType != "" { + event.Fields.DeepUpdate(common.MapStr{ + m.Target: mimeType, + }) + } + return event, nil +} + +func (m *mimeTypeProcessor) String() string { + return fmt.Sprintf("detect_mime_type=%+v->%+v", m.Field, m.Target) +} diff --git a/libbeat/processors/actions/detect_mime_type_test.go b/libbeat/processors/actions/detect_mime_type_test.go new file mode 100644 index 00000000000..51de6c9062f --- /dev/null +++ b/libbeat/processors/actions/detect_mime_type_test.go @@ -0,0 +1,62 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. 
Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package actions + +import ( + "testing" + + "github.com/stretchr/testify/require" + + "github.com/elastic/beats/v7/libbeat/beat" + "github.com/elastic/beats/v7/libbeat/common" +) + +func TestMimeTypeFromTo(t *testing.T) { + evt := beat.Event{ + Fields: common.MapStr{ + "foo.bar.baz": "hello world!", + }, + } + p, err := NewDetectMimeType(common.MustNewConfigFrom(map[string]interface{}{ + "field": "foo.bar.baz", + "target": "bar.baz.zoiks", + })) + require.NoError(t, err) + observed, err := p.Run(&evt) + require.NoError(t, err) + enriched, err := observed.Fields.GetValue("bar.baz.zoiks") + require.NoError(t, err) + require.Equal(t, "text/plain; charset=utf-8", enriched) +} + +func TestMimeTypeTestNoMatch(t *testing.T) { + evt := beat.Event{ + Fields: common.MapStr{ + "foo.bar.baz": string([]byte{0, 0}), + }, + } + p, err := NewDetectMimeType(common.MustNewConfigFrom(map[string]interface{}{ + "field": "foo.bar.baz", + "target": "bar.baz.zoiks", + })) + require.NoError(t, err) + observed, err := p.Run(&evt) + require.NoError(t, err) + hasKey, _ := observed.Fields.HasKey("bar.baz.zoiks") + require.False(t, hasKey) +} diff --git a/libbeat/processors/actions/docs/detect_mime_type.asciidoc b/libbeat/processors/actions/docs/detect_mime_type.asciidoc new file mode 100644 index 00000000000..c93c6f882e9 --- /dev/null +++ b/libbeat/processors/actions/docs/detect_mime_type.asciidoc @@ 
-0,0 +1,23 @@ +[[detect-mime-type]] +=== Detect mime type + +++++ +<titleabbrev>detect_mime_type</titleabbrev> +++++ + +The `detect_mime_type` processor attempts to detect a mime type for a field that +contains a given stream of bytes. The `field` key contains the field used as +the data source and the `target` key contains the field to populate with the detected type. + +[source,yaml] +------- +processors: + - detect_mime_type: + field: http.request.body.content + target: http.request.mime_type +------- + +In the example above: + - http.request.body.content is used as the source and http.request.mime_type is set to the detected mime type + +See <<conditions>> for a list of supported conditions. diff --git a/packetbeat/_meta/config/processors.yml.tmpl b/packetbeat/_meta/config/processors.yml.tmpl index d2cadbe46b1..17b1ca2b540 100644 --- a/packetbeat/_meta/config/processors.yml.tmpl +++ b/packetbeat/_meta/config/processors.yml.tmpl @@ -10,3 +10,9 @@ processors: - add_host_metadata: ~ - add_cloud_metadata: ~ - add_docker_metadata: ~ + - detect_mime_type: + field: http.request.body.content + target: http.request.mime_type + - detect_mime_type: + field: http.response.body.content + target: http.response.mime_type diff --git a/packetbeat/packetbeat.yml b/packetbeat/packetbeat.yml index f7e19b268b8..102ba0fb045 100644 --- a/packetbeat/packetbeat.yml +++ b/packetbeat/packetbeat.yml @@ -214,6 +214,12 @@ processors: - add_host_metadata: ~ - add_cloud_metadata: ~ - add_docker_metadata: ~ + - detect_mime_type: + field: http.request.body.content + target: http.request.mime_type + - detect_mime_type: + field: http.response.body.content + target: http.response.mime_type # ================================== Logging =================================== diff --git a/packetbeat/tests/system/config/packetbeat.yml.j2 b/packetbeat/tests/system/config/packetbeat.yml.j2 index 7b253d8ec2c..cee36a769d9 100644 --- a/packetbeat/tests/system/config/packetbeat.yml.j2 +++ b/packetbeat/tests/system/config/packetbeat.yml.j2 @@ -194,7 
+194,16 @@ tags: [ packetbeat.shutdown_timeout: {{ shutdown_timeout|default('400ms') }} -{%- if processors %} +{%- if include_mime %} +processors: + - detect_mime_type: + field: http.request.body.content + target: http.request.mime_type + - detect_mime_type: + field: http.response.body.content + target: http.response.mime_type + +{%- elif processors %} #================================ Filters ===================================== diff --git a/packetbeat/tests/system/test_0063_http_body.py b/packetbeat/tests/system/test_0063_http_body.py index 349624e32d9..17cecb62b7d 100644 --- a/packetbeat/tests/system/test_0063_http_body.py +++ b/packetbeat/tests/system/test_0063_http_body.py @@ -43,6 +43,7 @@ def test_include_body_for_both_request_response(self): """ self.render_config_template( http_include_body_for=["x-www-form-urlencoded", "text/html"], + include_mime=True ) self.run_packetbeat(pcap="http_post.pcap", debug_selectors=["http", "httpdetailed"]) @@ -58,6 +59,8 @@ def test_include_body_for_both_request_response(self): assert len(o["http.request.body.content"]) > 0 assert len(o["http.response.body.content"]) > 0 + assert o["http.request.mime_type"] == "text/plain; charset=utf-8" + assert o["http.response.mime_type"] == "text/html; charset=utf-8" assert "request" not in o assert "response" not in o diff --git a/x-pack/packetbeat/packetbeat.yml b/x-pack/packetbeat/packetbeat.yml index f7e19b268b8..102ba0fb045 100644 --- a/x-pack/packetbeat/packetbeat.yml +++ b/x-pack/packetbeat/packetbeat.yml @@ -214,6 +214,12 @@ processors: - add_host_metadata: ~ - add_cloud_metadata: ~ - add_docker_metadata: ~ + - detect_mime_type: + field: http.request.body.content + target: http.request.mime_type + - detect_mime_type: + field: http.response.body.content + target: http.response.mime_type # ================================== Logging ===================================