Skip to content

Commit

Permalink
Add metrics, dashboard, and test suite
Browse files Browse the repository at this point in the history
  • Loading branch information
Lindsay Hanks authored and jdn5126 committed Sep 12, 2023
1 parent 94ca03a commit 320cd8a
Show file tree
Hide file tree
Showing 15 changed files with 3,864 additions and 0 deletions.
2,404 changes: 2,404 additions & 0 deletions config/grafana/grafana_dashboard.json

Large diffs are not rendered by default.

40 changes: 40 additions & 0 deletions pkg/ipamd/datastore/data_store.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,19 @@ var (
},
[]string{"cidr"},
)
noAvailableAddrs = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "awscni_err_no_avail_addrs",
Help: "The number of IP/Prefix assignments that fail due to no available addresses at the ENI level",
},
)
eniUtilization = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "awscni_eni_util",
Help: "The number of allocated ips partitioned by eni",
},
[]string{"fn"},
)
prometheusRegistered = false
)

Expand Down Expand Up @@ -344,6 +357,8 @@ func prometheusRegister() {
prometheus.MustRegister(forceRemovedIPs)
prometheus.MustRegister(totalPrefixes)
prometheus.MustRegister(ipsPerCidr)
prometheus.MustRegister(noAvailableAddrs)
prometheus.MustRegister(eniUtilization)
prometheusRegistered = true
}
}
Expand Down Expand Up @@ -521,6 +536,7 @@ func (ds *DataStore) AddENI(eniID string, deviceNumber int, isPrimary, isTrunk,
DeviceNumber: deviceNumber,
AvailableIPv4Cidrs: make(map[string]*CidrInfo)}

ds.GetENIUtilization()
enis.Set(float64(len(ds.eniPool)))
return nil
}
Expand Down Expand Up @@ -714,6 +730,7 @@ func (ds *DataStore) AssignPodIPv6Address(ipamKey IPAMKey, ipamMetadata IPAMMeta
return addr.Address, eni.DeviceNumber, nil
}
}
noAvailableAddrs.Inc()
return "", -1, errors.New("assignPodIPv6AddressUnsafe: no available IP addresses")
}

Expand Down Expand Up @@ -781,6 +798,7 @@ func (ds *DataStore) AssignPodIPv4Address(ipamKey IPAMKey, ipamMetadata IPAMMeta
ds.log.Debugf("AssignPodIPv4Address: ENI %s does not have available addresses", eni.ID)
}

noAvailableAddrs.Inc()
ds.log.Errorf("DataStore has no available IP/Prefix addresses")
return "", -1, errors.New("assignPodIPv4AddressUnsafe: no available IP/Prefix addresses")
}
Expand All @@ -797,6 +815,7 @@ func (ds *DataStore) assignPodIPAddressUnsafe(addr *AddressInfo, ipamKey IPAMKey
addr.IPAMMetadata = ipamMetadata
addr.AssignedTime = assignedTime

ds.log.Debugf("IP allocation request")
ds.assigned++
// Prometheus gauge
assignedIPs.Set(float64(ds.assigned))
Expand All @@ -813,6 +832,7 @@ func (ds *DataStore) unassignPodIPAddressUnsafe(addr *AddressInfo) {
addr.IPAMKey = IPAMKey{} // unassign the addr
addr.IPAMMetadata = IPAMMetadata{}
ds.assigned--
ds.log.Debugf("IP deallocation request")
// Prometheus gauge
assignedIPs.Set(float64(ds.assigned))
}
Expand Down Expand Up @@ -866,6 +886,24 @@ func (ds *DataStore) GetIPStats(addressFamily string) *DataStoreStats {
return stats
}

// GetENIUtilization updates a Prometheus gauge vector with each ENIs id and how many ip addresses are assigned on it
func (ds *DataStore) GetENIUtilization() {
//eniUtilization.Reset()
for _, eni := range ds.eniPool {
count := 0
for _, assignedAddr := range eni.AvailableIPv4Cidrs {
for _, addr := range assignedAddr.IPAddresses {
if addr.Assigned() {
count += 1
}
}
}
utilization := count
eniID := eni.ID
eniUtilization.WithLabelValues(eniID).Set(float64(utilization))
}
}

// GetTrunkENI returns the trunk ENI ID or an empty string
func (ds *DataStore) GetTrunkENI() string {
ds.lock.Lock()
Expand Down Expand Up @@ -1072,6 +1110,7 @@ func (ds *DataStore) RemoveUnusedENIFromStore(warmIPTarget, minimumIPTarget, war

// Prometheus update
enis.Set(float64(len(ds.eniPool)))
ds.GetENIUtilization()
totalIPs.Set(float64(ds.total))
return removableENI
}
Expand Down Expand Up @@ -1126,6 +1165,7 @@ func (ds *DataStore) RemoveENIFromDataStore(eniID string, force bool) error {

// Prometheus gauge
enis.Set(float64(len(ds.eniPool)))
ds.GetENIUtilization()
return nil
}

Expand Down
2 changes: 2 additions & 0 deletions pkg/ipamd/ipamd.go
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,8 @@ func (c *IPAMContext) updateIPPoolIfRequired(ctx context.Context) {
if c.shouldRemoveExtraENIs() {
c.tryFreeENI()
}
// Prometheus Metric
c.dataStore.GetENIUtilization()
}

// decreaseDatastorePool runs every `interval` and attempts to return unused ENIs and IPs
Expand Down
26 changes: 26 additions & 0 deletions test/integration/warm-pool/clear_warm_env.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package warm_pool

import (
k8sUtils "github.com/aws/amazon-vpc-cni-k8s/test/framework/resources/k8s/utils"
. "github.com/onsi/ginkgo/v2"
)

// Environment variables are not reset before and after each test so that way multiple tests can be run to
// evaluate behavior. You can run this test which will unset all warm pool environment variables. Or, if you
// want to test the behavior with some of those environment variables set, alter them in that file and run it once before
// you run the desired tests.
var _ = Describe("clear warm env", func() {
Context("Clear out environment variables for warm pool for testing", func() {

It("Unsetting env variables", func() {
k8sUtils.UpdateEnvVarOnDaemonSetAndWaitUntilReady(f, "aws-node", "kube-system",
"aws-node", map[string]string{},
map[string]struct{}{
"WARM_ENI_TARGET": {},
"WARM_IP_TARGET": {},
"MINIMUM_IP_TARGET": {},
"WARM_PREFIX_TARGET": {},
})
})
})
})
26 changes: 26 additions & 0 deletions test/integration/warm-pool/set_warm_env.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package warm_pool

import (
k8sUtils "github.com/aws/amazon-vpc-cni-k8s/test/framework/resources/k8s/utils"
"github.com/aws/amazon-vpc-cni-k8s/test/framework/utils"
. "github.com/onsi/ginkgo/v2"
"strconv"
)

// Environment variables are not reset before and after each test so that way multiple tests can be run to
// evaluate behavior. You can run this test which will unset all warm pool environment variables. Or, if you
// want to test the behavior with some of those environment variables set, alter them in that file and run it once before
// you run the desired tests.
var _ = Describe("set warm env", func() {
Context("Sets env variables", func() {

It("Sets env variables", func() {
k8sUtils.AddEnvVarToDaemonSetAndWaitTillUpdated(f,
utils.AwsNodeName, utils.AwsNodeNamespace, utils.AwsNodeName,
map[string]string{
"WARM_IP_TARGET": strconv.Itoa(0),
"ENABLE_DYNAMIC_WARM_POOL": strconv.FormatBool(true),
})
})
})
})
98 changes: 98 additions & 0 deletions test/integration/warm-pool/use_case_1_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
package warm_pool

import (
"fmt"
"github.com/aws/amazon-vpc-cni-k8s/test/framework/resources/k8s/manifest"
"github.com/aws/amazon-vpc-cni-k8s/test/framework/utils"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
v1 "k8s.io/api/core/v1"
"time"
)

var primaryNode v1.Node

// This test scales up the cluster to maxPods, then scales it back down to minPods.
var _ = Describe("use case 1", func() {
Context("Quick Scale Up and Down", func() {

BeforeEach(func() {
By("Getting Warm Pool Environment Variables Before Test")
getWarmPoolEnvVars()
})

It("Scales the cluster and checks warm pool before and after", func() {
fmt.Fprintf(GinkgoWriter, "Deploying %v minimum pods\n", minPods)

start := time.Now().Unix()

fmt.Fprintf(GinkgoWriter, "Scaling cluster up to %v pods\n", minPods)
deploymentSpec := manifest.NewBusyBoxDeploymentBuilder(f.Options.TestImageRegistry).
Namespace("default").
Name("busybox").
NodeName(primaryNode.Name).
Namespace(utils.DefaultTestNamespace).
Replicas(minPods).
Build()

_, err := f.K8sResourceManagers.
DeploymentManager().
CreateAndWaitTillDeploymentIsReady(deploymentSpec, utils.DefaultDeploymentReadyTimeout*5)
Expect(err).ToNot(HaveOccurred())

if minPods != 0 {
time.Sleep(sleep)
}

fmt.Fprintf(GinkgoWriter, "Scaling cluster up to %v pods\n", maxPods)
quickScale(maxPods)

Expect(maxPods).To(Equal(busyboxPodCnt()))

fmt.Fprintf(GinkgoWriter, "Scaling cluster down to %v pods\n", minPods)
quickScale(minPods)

end := time.Now().Unix()

fmt.Fprintf(GinkgoWriter, fmt.Sprintf("Start Time: %v\n", start))
fmt.Fprintf(GinkgoWriter, fmt.Sprintf("End Time: %v\n", end))

By("Starting Curl Container")
curlContainer := manifest.NewCurlContainer().
Command([]string{"sleep", "1000"}).Build()

getCurlPod := manifest.NewDefaultPodBuilder().
Name("curl-pod").
Namespace(utils.DefaultTestNamespace).
NodeName(primaryNode.Name).
HostNetwork(true).
Container(curlContainer).
Build()

testPod, err := f.K8sResourceManagers.PodManager().
CreateAndWaitTillPodCompleted(getCurlPod)

logs, errLogs := f.K8sResourceManagers.PodManager().
PodLogs(testPod.Namespace, testPod.Name)
Expect(errLogs).ToNot(HaveOccurred())
fmt.Fprintln(GinkgoWriter, logs)

By("Fetching metrics via Curl Container")
getMetrics(start, end)

By("Deleting the deployment")
err = f.K8sResourceManagers.DeploymentManager().DeleteAndWaitTillDeploymentIsDeleted(deploymentSpec)
Expect(err).NotTo(HaveOccurred())

By("Deleting Curl Container")
err = f.K8sResourceManagers.PodManager().DeleteAndWaitTillPodDeleted(getCurlPod)
Expect(err).NotTo(HaveOccurred())
})

AfterEach(func() {
By("Getting Warm Pool Environment Variables After Test")
getWarmPoolEnvVars()

})
})
})
104 changes: 104 additions & 0 deletions test/integration/warm-pool/use_case_2_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
package warm_pool

import (
"fmt"
"github.com/aws/amazon-vpc-cni-k8s/test/framework/resources/k8s/manifest"
"github.com/aws/amazon-vpc-cni-k8s/test/framework/utils"
"strconv"
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)

// This test replicates sawtooth behavior by adding a fixed amount of pods and removing the same fixed amount of pods
// over a preset number of iterations.
var _ = Describe("use case 2", func() {
Context("Sawtooth Fixed Add and Subtract", func() {

BeforeEach(func() {
By("Getting Warm Pool Environment Variables Before Test")
getWarmPoolEnvVars()
})

It("Scales the cluster and checks warm pool before and after", func() {
replicas := minPods

start := time.Now().Unix()

fmt.Fprintf(GinkgoWriter, "Deploying %v minimum pods\n", minPods)
deploymentSpec := manifest.NewBusyBoxDeploymentBuilder(f.Options.TestImageRegistry).
Namespace("default").
Name("busybox").
NodeName(primaryNode.Name).
Namespace(utils.DefaultTestNamespace).
Replicas(replicas).
Build()

_, err := f.K8sResourceManagers.
DeploymentManager().
CreateAndWaitTillDeploymentIsReady(deploymentSpec, utils.DefaultDeploymentReadyTimeout*5)
Expect(err).ToNot(HaveOccurred())

if minPods != 0 {
time.Sleep(sleep)
}

for i := 0; i < iterations; i++ {
By("Loop " + strconv.Itoa(i))
replicas = checkInRange(replicas + iterPods)
fmt.Fprintf(GinkgoWriter, "Scaling cluster up to %v pods\n", replicas)
quickScale(replicas)
Expect(replicas).To(Equal(busyboxPodCnt()))

replicas = checkInRange(replicas - iterPods)
fmt.Fprintf(GinkgoWriter, "Scaling cluster down to %v pods\n", replicas)
quickScale(replicas)
Expect(replicas).To(Equal(busyboxPodCnt()))
}

Expect(minPods).To(Equal(busyboxPodCnt()))

end := time.Now().Unix()

fmt.Fprintf(GinkgoWriter, fmt.Sprintf("Start Time: %v\n", start))
fmt.Fprintf(GinkgoWriter, fmt.Sprintf("End Time: %v\n", end))

By("Starting Curl Container")
curlContainer := manifest.NewCurlContainer().
Command([]string{"sleep", "3600"}).Build()

getCurlPod := manifest.NewDefaultPodBuilder().
Name("curl-pod").
Namespace(utils.DefaultTestNamespace).
NodeName(primaryNode.Name).
HostNetwork(true).
Container(curlContainer).
Build()

testPod, err := f.K8sResourceManagers.PodManager().
CreateAndWaitTillPodCompleted(getCurlPod)

logs, errLogs := f.K8sResourceManagers.PodManager().
PodLogs(testPod.Namespace, testPod.Name)
Expect(errLogs).ToNot(HaveOccurred())
fmt.Fprintln(GinkgoWriter, logs)

By("Fetching metrics via Curl Container")
getMetrics(start, end)

By("Deleting the deployment")
err = f.K8sResourceManagers.DeploymentManager().DeleteAndWaitTillDeploymentIsDeleted(deploymentSpec)
Expect(err).NotTo(HaveOccurred())

By("Deleting Curl Container")
err = f.K8sResourceManagers.PodManager().DeleteAndWaitTillPodDeleted(getCurlPod)
Expect(err).NotTo(HaveOccurred())
})

AfterEach(func() {
By("Getting Warm Pool Environment Variables After Test")
getWarmPoolEnvVars()
})
})
})
Loading

0 comments on commit 320cd8a

Please sign in to comment.