From e948232820c1d3d6a6241c04d10dba1ba96fafc6 Mon Sep 17 00:00:00 2001
From: maxunt <max.untrecht@gmail.com>
Date: Fri, 17 Aug 2018 13:45:22 -0700
Subject: [PATCH] Add ability to set measurement from matched text in grok
 parser (#4433)

---
 docs/DATA_FORMATS_INPUT.md                    | 91 ++++++++++---------
 plugins/inputs/file/README.md                 |  2 +-
 plugins/inputs/file/dev/docker-compose.yml    |  2 +-
 plugins/inputs/file/dev/json_a.log            | 14 ---
 .../file/{ => dev}/testfiles/grok_a.log       |  0
 .../file/{ => dev}/testfiles/json_a.log       |  0
 plugins/inputs/file/file.go                   |  9 +-
 plugins/inputs/file/file_test.go              | 12 +--
 plugins/parsers/grok/parser.go                |  6 +-
 plugins/parsers/grok/parser_test.go           | 50 ++++++++++
 10 files changed, 112 insertions(+), 74 deletions(-)
 delete mode 100644 plugins/inputs/file/dev/json_a.log
 rename plugins/inputs/file/{ => dev}/testfiles/grok_a.log (100%)
 rename plugins/inputs/file/{ => dev}/testfiles/json_a.log (100%)

diff --git a/docs/DATA_FORMATS_INPUT.md b/docs/DATA_FORMATS_INPUT.md
index 753523843f249..ded0170ec80d2 100644
--- a/docs/DATA_FORMATS_INPUT.md
+++ b/docs/DATA_FORMATS_INPUT.md
@@ -670,50 +670,6 @@ The best way to get acquainted with grok patterns is to read the logstash docs,
 which are available here:
   https://www.elastic.co/guide/en/logstash/current/plugins-filters-grok.html
 
-#### Grok Configuration:
-```toml
-[[inputs.file]]
-  ## Files to parse each interval.
-  ## These accept standard unix glob matching rules, but with the addition of
-  ## ** as a "super asterisk". ie:
-  ##   /var/log/**.log     -> recursively find all .log files in /var/log
-  ##   /var/log/*/*.log    -> find all .log files with a parent dir in /var/log
-  ##   /var/log/apache.log -> only tail the apache log file
-  files = ["/var/log/apache/access.log"]
-
-  ## The dataformat to be read from files
-  ## Each data format has its own unique set of configuration options, read
-  ## more about them here:
-  ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
-  data_format = "grok"
-
-  ## This is a list of patterns to check the given log file(s) for.
-  ## Note that adding patterns here increases processing time. The most
-  ## efficient configuration is to have one pattern.
-  ## Other common built-in patterns are:
-  ##   %{COMMON_LOG_FORMAT}   (plain apache & nginx access logs)
-  ##   %{COMBINED_LOG_FORMAT} (access logs + referrer & agent)
-  grok_patterns = ["%{COMBINED_LOG_FORMAT}"]
-
-  ## Full path(s) to custom pattern files.
-  grok_custom_pattern_files = []
-
-  ## Custom patterns can also be defined here. Put one pattern per line.
-  grok_custom_patterns = '''
-  '''
-
-  ## Timezone allows you to provide an override for timestamps that
-  ## don't already include an offset
-  ## e.g. 04/06/2016 12:41:45 data one two 5.43µs
-  ##
-  ## Default: "" which renders UTC
-  ## Options are as follows:
-  ##   1. Local             -- interpret based on machine localtime
-  ##   2. "Canada/Eastern"  -- Unix TZ values like those found in https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
-  ##   3. UTC               -- or blank/unspecified, will return timestamp in UTC
-  grok_timezone = "Canada/Eastern"
-```
-
 The grok parser uses a slightly modified version of logstash "grok"
 patterns, with the format:
 
@@ -740,6 +696,7 @@ You must capture at least one field per line.
   - duration (ie, 5.23ms gets converted to int nanoseconds)
   - tag      (converts the field into a tag)
   - drop     (drops the field completely)
+  - measurement (use the matched text as the measurement name)
 - Timestamp modifiers:
   - ts               (This will auto-learn the timestamp format)
   - ts-ansic         ("Mon Jan _2 15:04:05 2006")
@@ -759,7 +716,7 @@ You must capture at least one field per line.
   - ts-"CUSTOM"
 
 CUSTOM time layouts must be within quotes and be the representation of the
-"reference time", which is `Mon Jan 2 15:04:05 -0700 MST 2006`.  
+"reference time", which is `Mon Jan 2 15:04:05 -0700 MST 2006`.
 To match a comma decimal point you can use a period.  For example `%{TIMESTAMP:timestamp:ts-"2006-01-02 15:04:05.000"}` can be used to match `"2018-01-02 15:04:05,000"`
 To match a comma decimal point you can use a period in the pattern string.
 See https://golang.org/pkg/time/#Parse for more details.
@@ -773,6 +730,50 @@ logstash patterns that depend on these are not supported._
 If you need help building patterns to match your logs,
 you will find the https://grokdebug.herokuapp.com application quite useful!
 
+#### Grok Configuration:
+```toml
+[[inputs.file]]
+  ## Files to parse each interval.
+  ## These accept standard unix glob matching rules, but with the addition of
+  ## ** as a "super asterisk". ie:
+  ##   /var/log/**.log     -> recursively find all .log files in /var/log
+  ##   /var/log/*/*.log    -> find all .log files with a parent dir in /var/log
+  ##   /var/log/apache.log -> only tail the apache log file
+  files = ["/var/log/apache/access.log"]
+
+  ## The dataformat to be read from files
+  ## Each data format has its own unique set of configuration options, read
+  ## more about them here:
+  ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
+  data_format = "grok"
+
+  ## This is a list of patterns to check the given log file(s) for.
+  ## Note that adding patterns here increases processing time. The most
+  ## efficient configuration is to have one pattern.
+  ## Other common built-in patterns are:
+  ##   %{COMMON_LOG_FORMAT}   (plain apache & nginx access logs)
+  ##   %{COMBINED_LOG_FORMAT} (access logs + referrer & agent)
+  grok_patterns = ["%{COMBINED_LOG_FORMAT}"]
+
+  ## Full path(s) to custom pattern files.
+  grok_custom_pattern_files = []
+
+  ## Custom patterns can also be defined here. Put one pattern per line.
+  grok_custom_patterns = '''
+  '''
+
+  ## Timezone allows you to provide an override for timestamps that
+  ## don't already include an offset
+  ## e.g. 04/06/2016 12:41:45 data one two 5.43µs
+  ##
+  ## Default: "" which renders UTC
+  ## Options are as follows:
+  ##   1. Local             -- interpret based on machine localtime
+  ##   2. "Canada/Eastern"  -- Unix TZ values like those found in https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
+  ##   3. UTC               -- or blank/unspecified, will return timestamp in UTC
+  grok_timezone = "Canada/Eastern"
+```
+
 #### Timestamp Examples
 
 This example input and config parses a file using a custom timestamp conversion:
diff --git a/plugins/inputs/file/README.md b/plugins/inputs/file/README.md
index 73a3a2362e0f0..4358b67ad2668 100644
--- a/plugins/inputs/file/README.md
+++ b/plugins/inputs/file/README.md
@@ -14,7 +14,7 @@ use the [tail input plugin](/plugins/inputs/tail) instead.
   ## ** as a "super asterisk". ie:
   ##   /var/log/**.log     -> recursively find all .log files in /var/log
   ##   /var/log/*/*.log    -> find all .log files with a parent dir in /var/log
-  ##   /var/log/apache.log -> only tail the apache log file
+  ##   /var/log/apache.log -> only read the apache log file
   files = ["/var/log/apache/access.log"]
 
   ## Data format to consume.
diff --git a/plugins/inputs/file/dev/docker-compose.yml b/plugins/inputs/file/dev/docker-compose.yml
index 3c16fca909ebd..efce389f78424 100644
--- a/plugins/inputs/file/dev/docker-compose.yml
+++ b/plugins/inputs/file/dev/docker-compose.yml
@@ -6,7 +6,7 @@ services:
       volumes:
         - ./telegraf.conf:/telegraf.conf
         - ../../../../telegraf:/telegraf
-        - ./json_a.log:/var/log/test.log
+        - ./testfiles/json_a.log:/var/log/test.log
       entrypoint:
         - /telegraf
         - --config
diff --git a/plugins/inputs/file/dev/json_a.log b/plugins/inputs/file/dev/json_a.log
deleted file mode 100644
index 0f52e9d1e3b57..0000000000000
--- a/plugins/inputs/file/dev/json_a.log
+++ /dev/null
@@ -1,14 +0,0 @@
-{
-"parent": {
-	"child": 3.0,
-	"ignored_child": "hi"
-},
-"ignored_null": null,
-"integer": 4,
-"list": [3, 4],
-"ignored_parent": {
-	"another_ignored_null": null,
-	"ignored_string": "hello, world!"
-},
-"another_list": [4]
-}
diff --git a/plugins/inputs/file/testfiles/grok_a.log b/plugins/inputs/file/dev/testfiles/grok_a.log
similarity index 100%
rename from plugins/inputs/file/testfiles/grok_a.log
rename to plugins/inputs/file/dev/testfiles/grok_a.log
diff --git a/plugins/inputs/file/testfiles/json_a.log b/plugins/inputs/file/dev/testfiles/json_a.log
similarity index 100%
rename from plugins/inputs/file/testfiles/json_a.log
rename to plugins/inputs/file/dev/testfiles/json_a.log
diff --git a/plugins/inputs/file/file.go b/plugins/inputs/file/file.go
index 2779561fc2ffb..d6714301eaed2 100644
--- a/plugins/inputs/file/file.go
+++ b/plugins/inputs/file/file.go
@@ -11,9 +11,8 @@ import (
 )
 
 type File struct {
-	Files         []string `toml:"files"`
-	FromBeginning bool
-	parser        parsers.Parser
+	Files  []string `toml:"files"`
+	parser parsers.Parser
 
 	filenames []string
 }
@@ -24,7 +23,7 @@ const sampleConfig = `
   ## ** as a "super asterisk". ie:
   ##   /var/log/**.log     -> recursively find all .log files in /var/log
   ##   /var/log/*/*.log    -> find all .log files with a parent dir in /var/log
-  ##   /var/log/apache.log -> only tail the apache log file
+  ##   /var/log/apache.log -> only read the apache log file
   files = ["/var/log/apache/access.log"]
 
   ## The dataformat to be read from files
@@ -40,7 +39,7 @@ func (f *File) SampleConfig() string {
 }
 
 func (f *File) Description() string {
-	return "reload and gather from file[s] on telegraf's interval"
+	return "Reload and gather from file[s] on telegraf's interval."
 }
 
 func (f *File) Gather(acc telegraf.Accumulator) error {
diff --git a/plugins/inputs/file/file_test.go b/plugins/inputs/file/file_test.go
index 28105664615a1..43322c2e84cf9 100644
--- a/plugins/inputs/file/file_test.go
+++ b/plugins/inputs/file/file_test.go
@@ -14,26 +14,26 @@ import (
 func TestRefreshFilePaths(t *testing.T) {
 	wd, err := os.Getwd()
 	r := File{
-		Files: []string{filepath.Join(wd, "testfiles/**.log")},
+		Files: []string{filepath.Join(wd, "dev/testfiles/**.log")},
 	}
 
 	err = r.refreshFilePaths()
 	require.NoError(t, err)
-	assert.Equal(t, len(r.filenames), 2)
+	assert.Equal(t, 2, len(r.filenames))
 }
 func TestJSONParserCompile(t *testing.T) {
 	var acc testutil.Accumulator
 	wd, _ := os.Getwd()
 	r := File{
-		Files: []string{filepath.Join(wd, "testfiles/json_a.log")},
+		Files: []string{filepath.Join(wd, "dev/testfiles/json_a.log")},
 	}
 	parserConfig := parsers.Config{
 		DataFormat: "json",
 		TagKeys:    []string{"parent_ignored_child"},
 	}
 	nParser, err := parsers.NewParser(&parserConfig)
-	r.parser = nParser
 	assert.NoError(t, err)
+	r.parser = nParser
 
 	r.Gather(&acc)
 	assert.Equal(t, map[string]string{"parent_ignored_child": "hi"}, acc.Metrics[0].Tags)
@@ -44,7 +44,7 @@ func TestGrokParser(t *testing.T) {
 	wd, _ := os.Getwd()
 	var acc testutil.Accumulator
 	r := File{
-		Files: []string{filepath.Join(wd, "testfiles/grok_a.log")},
+		Files: []string{filepath.Join(wd, "dev/testfiles/grok_a.log")},
 	}
 
 	parserConfig := parsers.Config{
@@ -57,5 +57,5 @@ func TestGrokParser(t *testing.T) {
 	assert.NoError(t, err)
 
 	err = r.Gather(&acc)
-	assert.Equal(t, 2, len(acc.Metrics))
+	assert.Equal(t, 2, len(acc.Metrics))
 }
diff --git a/plugins/parsers/grok/parser.go b/plugins/parsers/grok/parser.go
index 096cb8ed830e6..bc65588eb9841 100644
--- a/plugins/parsers/grok/parser.go
+++ b/plugins/parsers/grok/parser.go
@@ -38,6 +38,7 @@ var timeLayouts = map[string]string{
 }
 
 const (
+	MEASUREMENT       = "measurement"
 	INT               = "int"
 	TAG               = "tag"
 	FLOAT             = "float"
@@ -217,7 +218,6 @@ func (p *Parser) ParseLine(line string) (telegraf.Metric, error) {
 		if k == "" || v == "" {
 			continue
 		}
-
 		// t is the modifier of the field
 		var t string
 		// check if pattern has some modifiers
@@ -239,6 +239,8 @@ func (p *Parser) ParseLine(line string) (telegraf.Metric, error) {
 		}
 
 		switch t {
+		case MEASUREMENT:
+			p.Measurement = v
 		case INT:
 			iv, err := strconv.ParseInt(v, 10, 64)
 			if err != nil {
@@ -350,7 +352,7 @@ func (p *Parser) ParseLine(line string) (telegraf.Metric, error) {
 	}
 
 	if len(fields) == 0 {
-		return nil, fmt.Errorf("logparser_grok: must have one or more fields")
+		return nil, fmt.Errorf("grok: must have one or more fields")
 	}
 
 	return metric.New(p.Measurement, tags, fields, p.tsModder.tsMod(timestamp))
diff --git a/plugins/parsers/grok/parser_test.go b/plugins/parsers/grok/parser_test.go
index 09f8fa16d89b5..8133d30212156 100644
--- a/plugins/parsers/grok/parser_test.go
+++ b/plugins/parsers/grok/parser_test.go
@@ -1,6 +1,7 @@
 package grok
 
 import (
+	"log"
 	"testing"
 	"time"
 
@@ -959,3 +960,52 @@ func TestReplaceTimestampComma(t *testing.T) {
 	//Convert Nanosecond to milisecond for compare
 	require.Equal(t, 555, m.Time().Nanosecond()/1000000)
 }
+
+func TestDynamicMeasurementModifier(t *testing.T) {
+	p := &Parser{
+		Patterns:       []string{"%{TEST}"},
+		CustomPatterns: "TEST %{NUMBER:var1:tag} %{NUMBER:var2:float} %{WORD:test:measurement}",
+	}
+
+	require.NoError(t, p.Compile())
+	m, err := p.ParseLine("4 5 hello")
+	require.NoError(t, err)
+	require.Equal(t, "hello", m.Name())
+}
+
+func TestStaticMeasurementModifier(t *testing.T) {
+	p := &Parser{
+		Patterns: []string{"%{WORD:hi:measurement} %{NUMBER:num:string}"},
+	}
+
+	require.NoError(t, p.Compile())
+	m, err := p.ParseLine("test_name 42")
+	log.Printf("%v", m)
+	require.NoError(t, err)
+	require.Equal(t, "test_name", m.Name())
+}
+
+// tests that the top-level pattern's match names the metric when nested patterns also use the measurement modifier
+func TestTwoMeasurementModifier(t *testing.T) {
+	p := &Parser{
+		Patterns:       []string{"%{TEST:test_name:measurement}"},
+		CustomPatterns: "TEST %{NUMBER:var1:tag} %{NUMBER:var2:measurement} %{WORD:var3:measurement}",
+	}
+
+	require.NoError(t, p.Compile())
+	m, err := p.ParseLine("4 5 hello")
+	require.NoError(t, err)
+	require.Equal(t, "4 5 hello", m.Name())
+}
+
+func TestMeasurementModifierNoName(t *testing.T) {
+	p := &Parser{
+		Patterns:       []string{"%{TEST}"},
+		CustomPatterns: "TEST %{NUMBER:var1:tag} %{NUMBER:var2:float} %{WORD:hi:measurement}",
+	}
+
+	require.NoError(t, p.Compile())
+	m, err := p.ParseLine("4 5 hello")
+	require.NoError(t, err)
+	require.Equal(t, "hello", m.Name())
+}