Skip to content
This repository has been archived by the owner on Feb 8, 2024. It is now read-only.

Commit

Permalink
Merge pull request #29 from gravitational/jeff/kapacitor
Browse files Browse the repository at this point in the history
Include Kapacitor in monitoring-app
  • Loading branch information
sofuture authored May 16, 2017
2 parents 52fd282 + a6f9351 commit 51017cb
Show file tree
Hide file tree
Showing 11 changed files with 418 additions and 3 deletions.
14 changes: 11 additions & 3 deletions images/Makefile
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
# Targets that name build actions rather than files on disk. `all` and `clean`
# are listed too so that a stray file with either name cannot mask them.
.PHONY: all clean heapster heapster-build heapster-clean influxdb influxdb-build \
        influxdb-clean grafana kapacitor telegraf deploy

# Working directory, captured once at parse time. With a plain `=` the
# $(shell pwd) call would be re-run on every expansion of $(PWD).
PWD := $(shell pwd)
.DEFAULT_GOAL := all

# Local image tags (name:version) used by the build rules and by `deploy`.
INFLUXDB_TAG := monitoring-influxdb:1.2.2
HEAPSTER_TAG := monitoring-heapster:1.0.2
GRAFANA_TAG := monitoring-grafana:3.0.4
KAPACITOR_TAG := monitoring-kapacitor:1.2
TELEGRAF_TAG := monitoring-telegraf:1.2.1
# NOTE(review): VER is not defined in the visible part of this file --
# presumably passed in by the caller or the environment; confirm.
HOOK_TAG := monitoring-hook:$(VER)

# Registry namespace the images are pushed to by `deploy`.
REPO_URL := quay.io/gravitational
Expand Down Expand Up @@ -40,12 +42,18 @@ influxdb: influxdb-build
# Each of the three image rules below builds from the directory that shares
# the target's name ($@) and re-pulls the base image on every build (--pull).
grafana:
	docker build --pull -t $(GRAFANA_TAG) $@

kapacitor:
	docker build --pull -t $(KAPACITOR_TAG) $@

telegraf:
	docker build --pull -t $(TELEGRAF_TAG) $@

# Build every image. heapster, influxdb and hook are defined outside this view.
all: heapster influxdb grafana kapacitor telegraf hook

# Only heapster and influxdb have clean steps.
clean: heapster-clean influxdb-clean

# Tag every locally built image for $(REPO_URL) and push it.
# The whole loop expands to a single shell command line; `set -e` makes that
# shell stop at the first failing `docker tag` / `docker push` instead of
# reporting only the status of the last command.
# telegraf is included because `all` builds it alongside the other images.
# NOTE(review): $(HOOK_TAG) is built by `all` but not pushed here -- confirm
# whether that is handled elsewhere.
.PHONY: deploy
deploy:
	set -e; $(foreach ct,$(INFLUXDB_TAG) $(HEAPSTER_TAG) $(GRAFANA_TAG) $(KAPACITOR_TAG) $(TELEGRAF_TAG), \
		docker tag $(ct) $(REPO_URL)/$(ct) ; \
		docker push $(REPO_URL)/$(ct) ; )
28 changes: 28 additions & 0 deletions images/kapacitor/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Kapacitor image for the monitoring app: installs the signed upstream .deb,
# ships a static kapacitor.conf and starts kapacitord through /entrypoint.sh.
FROM quay.io/gravitational/debian-grande:0.0.1

# wget downloads the release artifacts below.
RUN apt-get update && \
    apt-get install -y curl wget

# Import the public key used to verify the package signature.
# NOTE(review): the sks-keyservers.net pool may no longer be reachable --
# confirm this step still works, or switch to a keyserver that is.
RUN gpg \
    --keyserver hkp://pool.sks-keyservers.net \
    --recv-keys 05CE15085FC09D18E99EFB22684A14CF2582E0C5

ENV KAPACITOR_VERSION=1.2.1

# Download the package and its detached signature, verify, install, then
# remove both downloaded files.
RUN wget -q https://dl.influxdata.com/kapacitor/releases/kapacitor_${KAPACITOR_VERSION}_amd64.deb.asc && \
    wget -q https://dl.influxdata.com/kapacitor/releases/kapacitor_${KAPACITOR_VERSION}_amd64.deb && \
    gpg --batch --verify kapacitor_${KAPACITOR_VERSION}_amd64.deb.asc kapacitor_${KAPACITOR_VERSION}_amd64.deb && \
    dpkg -i kapacitor_${KAPACITOR_VERSION}_amd64.deb && \
    rm -f kapacitor_${KAPACITOR_VERSION}_amd64.deb*
COPY kapacitor.conf /etc/kapacitor/kapacitor.conf

EXPOSE 9092

VOLUME /var/lib/kapacitor

# Run the cleanup script only when it exists (presumably provided by the base
# image). The previous `test -f ... && sh ...` form failed the whole build
# when the file was missing.
RUN if [ -f /cleanup.sh ]; then sh /cleanup.sh; fi

COPY entrypoint.sh /entrypoint.sh
COPY loadalerts.sh /loadalerts.sh

# CMD executes /entrypoint.sh directly, so make sure the scripts are
# executable regardless of the file modes in the build context.
RUN chmod a+rx /entrypoint.sh /loadalerts.sh

# dumb-init is expected at this path (presumably from the base image).
ENTRYPOINT ["/usr/bin/dumb-init", "--"]
CMD ["/entrypoint.sh"]
7 changes: 7 additions & 0 deletions images/kapacitor/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Container entrypoint: export the hostname setting and start kapacitord.
set -e

# Fall back to the container's own hostname unless the caller supplied one.
# Presumably read by kapacitord as its configured hostname -- confirm.
export KAPACITOR_HOSTNAME="${KAPACITOR_HOSTNAME:-$HOSTNAME}"

# exec replaces this shell with the daemon, so kapacitord receives signals
# directly and its exit status becomes the script's exit status.
exec kapacitord
20 changes: 20 additions & 0 deletions images/kapacitor/kapacitor.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Kapacitor configuration; the image build copies this file to
# /etc/kapacitor/kapacitor.conf.
# All on-disk state lives under /var/lib/kapacitor, which the Dockerfile
# declares as a volume.
data_dir = "/var/lib/kapacitor"

[replay]
dir = "/var/lib/kapacitor/replay"

[storage]
boltdb = "/var/lib/kapacitor/kapacitor.db"

# SMTP settings. The section is disabled as shipped.
[smtp]
enabled = false
host = "localhost"
port = 25
username = ""
password = ""
from = ""
# NOTE(review): "[email protected]" looks like an address masked during page
# extraction rather than a real recipient -- restore it from the repository.
to = ["[email protected]"]
no-verify = false
idle-timeout = "30s"
global = false
state-changes-only = false
17 changes: 17 additions & 0 deletions images/kapacitor/loadalerts.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
#
# Keeps Kapacitor's task list in step with the TICKscripts in $ALERTDIR.
# Every 5 seconds, each <name>.tick file that has no task called <name> is
# defined as a stream task on k8s.default and then enabled.
#
# Environment:
#   URL       Kapacitor API endpoint   (default http://localhost:9092)
#   ALERTDIR  directory of *.tick files (default /opt/alerts)

URL=${URL:-http://localhost:9092}
ALERTDIR=${ALERTDIR:-/opt/alerts}

while true; do
    # Fetch the task list once per pass instead of once per script. If the
    # query fails, skip the pass: an empty list would make every alert look
    # missing.
    if tasks=$(kapacitor -url "$URL" list tasks); then
        for alert in "$ALERTDIR"/*.tick; do
            # With no matching files the glob stays literal; ignore it.
            [ -e "$alert" ] || continue

            filename=$(basename "$alert")
            alertname="${filename%.*}"

            # Compare the first column exactly (skipping the header row), so
            # that e.g. "high_cpu" is not treated as present just because
            # "high_cpu_v2" is.
            if ! awk -v name="$alertname" \
                    'NR > 1 && $1 == name { found = 1 } END { exit !found }' <<<"$tasks"; then
                echo "alert $alertname doesn't exist, creating"
                # Enable only a task that was actually defined.
                kapacitor -url "$URL" define "$alertname" -type stream -dbrp k8s.default -tick "$alert" &&
                    kapacitor -url "$URL" enable "$alertname"
            fi
        done
    else
        echo "could not list tasks at $URL, retrying" >&2
    fi
    sleep 5
done
27 changes: 27 additions & 0 deletions images/telegraf/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Telegraf image for the monitoring app: unpacks the upstream release tarball
# into the root filesystem and overlays the configuration and start script
# from rootfs/.
FROM quay.io/gravitational/debian-grande:0.0.1

ENV DEBIAN_FRONTEND=noninteractive \
    TERM=xterm \
    TELEGRAF_VERSION=1.2.1

# -f makes curl fail on an HTTP error instead of saving the error page.
# -C / extracts relative to the filesystem root explicitly rather than relying
# on the build's working directory; --strip-components=2 drops the archive's
# two leading path components.
# The packaged /etc/telegraf/* files are removed because rootfs/ supplies
# its own telegraf.conf.
# NOTE(review): the tarball is not checksum- or signature-verified, unlike the
# Kapacitor package -- consider adding verification.
RUN apt-get update && \
    apt-get install --yes --no-install-recommends curl tar && \
    curl -fsSL https://dl.influxdata.com/telegraf/releases/telegraf-${TELEGRAF_VERSION}_linux_amd64.tar.gz -o /telegraf.tar.gz && \
    tar xzf /telegraf.tar.gz -C / --strip-components=2 && \
    apt-get clean && \
    rm -rf \
        /var/lib/apt/lists/* \
        ~/.bashrc \
        /usr/share/doc/ \
        /usr/share/doc-base/ \
        /usr/share/man/ \
        /tmp/* \
        /telegraf.tar.gz \
        /etc/telegraf/*

# Plain directory copy; COPY is sufficient (no URL or archive handling needed).
COPY rootfs/ /

RUN chmod a+rx /usr/local/bin/run.sh

# dumb-init is expected at this path (presumably from the base image).
ENTRYPOINT ["/usr/bin/dumb-init", "--"]
CMD ["/usr/local/bin/run.sh"]
115 changes: 115 additions & 0 deletions images/telegraf/rootfs/etc/telegraf/telegraf.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Telegraf Configuration
#
# Telegraf is entirely plugin driven. All metrics are gathered from the
# declared inputs, and sent to the declared outputs.
#
# Plugins must be declared in here to be active.
# To deactivate a plugin, comment out the name and any variables.
#
# Use 'telegraf -config telegraf.conf -test' to see what metrics a config
# file would generate.
#
# Environment variables can be used anywhere in this config file, simply prepend
# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"),
# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR)


# Global tags can be specified here in key="value" format.
[global_tags]
# dc = "us-east-1" # will tag all metrics with dc=us-east-1
# rack = "1a"
## Environment variables can be used as tags, and throughout the config file
# user = "$USER"


# Configuration for telegraf agent
[agent]
## Default data collection interval for all inputs
interval = "20s"
## Rounds collection interval to 'interval'
## ie, if interval="10s" then always collect on :00, :10, :20, etc.
round_interval = true

## Telegraf will send metrics to outputs in batches of at
## most metric_batch_size metrics.
metric_batch_size = 1000
## For failed writes, telegraf will cache metric_buffer_limit metrics for each
## output, and will flush this buffer on a successful write. Oldest metrics
## are dropped first when this buffer fills.
metric_buffer_limit = 10000

## Collection jitter is used to jitter the collection by a random amount.
## Each plugin will sleep for a random time within jitter before collecting.
## This can be used to avoid many plugins querying things like sysfs at the
## same time, which can have a measurable effect on the system.
collection_jitter = "0s"

## Default flushing interval for all outputs. You shouldn't set this below
## interval. Maximum flush_interval will be flush_interval + flush_jitter
flush_interval = "10s"
## Jitter the flush interval by a random amount. This is primarily to avoid
## large write spikes for users running a large number of telegraf instances.
## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
flush_jitter = "0s"

## By default, precision will be set to the same timestamp order as the
## collection interval, with the maximum being 1s.
## Precision will NOT be used for service inputs, such as logparser and statsd.
## Valid values are "ns", "us" (or "µs"), "ms", "s".
precision = ""
## Run telegraf in debug mode
debug = false
## Run telegraf in quiet mode
quiet = true
## Override default hostname, if empty use os.Hostname()
hostname = ""
## If set to true, do not set the "host" tag in the telegraf agent.
omit_hostname = false


###############################################################################
# OUTPUT PLUGINS #
###############################################################################

# Configuration for influxdb server to send metrics to
[[outputs.influxdb]]
## The full HTTP or UDP endpoint URL for your InfluxDB instance.
## Multiple urls can be specified as part of the same cluster,
## this means that only ONE of the urls will be written to each interval.
# urls = ["udp://localhost:8089"] # UDP endpoint example
urls = ["http://influxdb.kube-system.svc:8086"] # required
## The target database for metrics (telegraf will create it if not exists).
database = "k8s" # required

## Retention policy to write to. Empty string writes to the default rp.
retention_policy = ""
## Write consistency (clusters only), can be: "any", "one", "quorum", "all"
write_consistency = "any"

## Write timeout (for the InfluxDB client), formatted as a string.
## If not provided, will default to 5s. 0s means no timeout (not recommended).
timeout = "5s"
# username = "telegraf"
# password = "metricsmetricsmetricsmetrics"
## Set the user agent for HTTP POSTs (can be useful for log differentiation)
# user_agent = "telegraf"
## Set UDP payload size, defaults to InfluxDB UDP Client default (512 bytes)
# udp_payload = 512

## Optional SSL Config
# ssl_ca = "/etc/telegraf/ca.pem"
# ssl_cert = "/etc/telegraf/cert.pem"
# ssl_key = "/etc/telegraf/key.pem"
## Use SSL but skip chain & host verification
# insecure_skip_verify = false


###############################################################################
# INPUT PLUGINS #
###############################################################################

[[inputs.prometheus]]
name_prefix = "k8s_"
urls = ["https://kubernetes.default.svc.cluster.local/metrics"]
ssl_ca = '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt'
bearer_token = '/var/run/secrets/kubernetes.io/serviceaccount/token'
4 changes: 4 additions & 0 deletions images/telegraf/rootfs/usr/local/bin/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/usr/bin/env bash

# Start telegraf with the bundled configuration. exec replaces this shell, so
# telegraf receives signals directly and its exit status becomes the script's.
exec /usr/bin/telegraf --config /etc/telegraf/telegraf.conf
34 changes: 34 additions & 0 deletions resources/alerts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# ConfigMap holding the Kapacitor alert definitions. Each key under `data` is
# one TICKscript file (<task name>.tick).
# NOTE(review): presumably mounted into the Kapacitor pod at the directory
# loadalerts.sh scans (default /opt/alerts) -- confirm in resources.yaml.
#
# Both scripts average `value` per node over consecutive 5-minute windows and
# raise a warning above 0.70 and a critical alert above 0.80.
# NOTE(review): the thresholds treat `used` as a fraction, yet the message
# appends "%" to the raw value -- confirm the intended unit.
# NOTE(review): .email() needs a working SMTP handler, while
# images/kapacitor/kapacitor.conf ships with [smtp] enabled = false -- confirm
# it is enabled some other way.
apiVersion: v1
kind: ConfigMap
metadata:
  name: kapacitor-alerts
  namespace: kube-system
data:
  high_cpu.tick: |
    stream
        |from()
            .measurement('cpu/node_utilization')
            .groupBy('nodename')
        |window()
            .period(5m)
            .every(5m)
        |mean('value').as('used')
        |alert()
            .message('{{ .Level}}: {{ .Name }}/{{ index .Tags "nodename" }} has high cpu usage: {{ index .Fields "used" }}%')
            .warn(lambda: "used" > 0.70)
            .crit(lambda: "used" > 0.80)
            .email()
  high_memory.tick: |
    stream
        |from()
            .measurement('memory/node_utilization')
            .groupBy('nodename')
        |window()
            .period(5m)
            .every(5m)
        |mean('value').as('used')
        |alert()
            .message('{{ .Level}}: {{ .Name }}/{{ index .Tags "nodename" }} has high memory usage: {{ index .Fields "used" }}%')
            .warn(lambda: "used" > 0.70)
            .crit(lambda: "used" > 0.80)
            .email()
2 changes: 2 additions & 0 deletions resources/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@ else
fi

/opt/bin/kubectl create -f /var/lib/gravity/resources/resources.yaml
/opt/bin/kubectl create -f /var/lib/gravity/resources/alerts.yaml

Loading

0 comments on commit 51017cb

Please sign in to comment.