Skip to content

Commit

Permalink
fix: reduce cardinality for tcp remote address metrics (#953)
Browse files Browse the repository at this point in the history
# Description

This PR makes the following changes:

- **Remove Port Label**: The port label is removed from the TCP remote
address metrics.

- **Aggregate IP Addresses**: Instead of tracking individual IP
addresses, the metrics now use a generalized `AllIPs` string to
represent all IP addresses.


![image](https://github.com/user-attachments/assets/877e004c-efdb-47d7-b438-5f0b340fce1f)

## Related Issue

If this pull request is related to any issue, please mention it here.
Additionally, make sure that the issue is assigned to you before
submitting this pull request.

## Checklist

- [ ] I have read the [contributing
documentation](https://retina.sh/docs/contributing).
- [ ] I signed and signed-off the commits (`git commit -S -s ...`). See
[this
documentation](https://docs.github.com/en/authentication/managing-commit-signature-verification/about-commit-signature-verification)
on signing commits.
- [ ] I have correctly attributed the author(s) of the code.
- [ ] I have tested the changes locally.
- [ ] I have followed the project's style guidelines.
- [ ] I have updated the documentation, if necessary.
- [ ] I have added tests, if applicable.

## Screenshots (if applicable) or Testing Completed

Please add any relevant screenshots or GIFs to showcase the changes
made.

## Additional Notes

Add any additional notes or context about the pull request here.

---

Please refer to the [CONTRIBUTING.md](../CONTRIBUTING.md) file for more
information on how to contribute to this project.

Co-authored-by: Jacques Massa <[email protected]>
  • Loading branch information
vakalapa and jimassa authored Nov 13, 2024
1 parent e7f81ed commit 78b5dca
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 116 deletions.
1 change: 0 additions & 1 deletion pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ func InitializeMetrics() {
utils.TCPConnectionRemoteGaugeName,
tcpConnectionRemoteGaugeDescription,
utils.Address,
utils.Port,
)
TCPConnectionStatsGauge = exporter.CreatePrometheusGaugeVecForMetric(
exporter.DefaultRegistry,
Expand Down
75 changes: 10 additions & 65 deletions pkg/plugin/linuxutil/netstat_stats_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,30 +13,15 @@ import (
"github.com/microsoft/retina/pkg/log"
"github.com/microsoft/retina/pkg/metrics"
"github.com/microsoft/retina/pkg/utils"
"github.com/pkg/errors"
"go.uber.org/zap"
)

const (
pathNetNetstat = "/proc/net/netstat"
pathNetSnmp = "/proc/net/snmp"
pathNetNetstat = "/proc/net/netstat"
pathNetSnmp = "/proc/net/snmp"
addrDefaultTCPRemote = "AllIPs"
)

var (
nodeIP = os.Getenv("NODE_IP")
ignoreList = []string{
"0.0.0.0",
"127.0.0.",
}
)

func init() {
// Add node IP to the ignore list
if nodeIP != "" {
ignoreList = append(ignoreList, nodeIP)
}
}

type NetstatReader struct {
l *log.ZapLogger
connStats *ConnectionStats
Expand All @@ -56,6 +41,7 @@ func NewNetstatReader(opts *NetstatOpts, ns NetstatInterface) *NetstatReader {
}

func (nr *NetstatReader) readAndUpdate() (*SocketStats, error) {
metrics.TCPConnectionRemoteGauge.WithLabelValues(addrDefaultTCPRemote).Set(0)
if err := nr.readConnectionStats(pathNetNetstat); err != nil {
return nil, err
}
Expand Down Expand Up @@ -201,28 +187,6 @@ func (nr *NetstatReader) readSockStats() error {
return err
} else {
sockStats := processSocks(socks)
// Compare existing tcp socket connections with updated ones, remove the ones that are not seen in the new sockStats map
// Log the socketByRemoteAddr map
if nr.opts.PrevTCPSockStats != nil {
for remoteAddr := range nr.opts.PrevTCPSockStats.socketByRemoteAddr {
addrPort, err := netip.ParseAddrPort(remoteAddr)
if err != nil {
return errors.Wrapf(err, "failed to parse remote address %s", remoteAddr)
}
addr := addrPort.Addr().String()
port := strconv.Itoa(int(addrPort.Port()))
if !validateRemoteAddr(addr) {
continue
}
// Check if the remote address is in the new sockStats map
if _, ok := sockStats.socketByRemoteAddr[remoteAddr]; !ok {
nr.l.Debug("Removing remote address from metrics", zap.String("remoteAddr", remoteAddr))
// If not, set the value to 0
metrics.TCPConnectionRemoteGauge.WithLabelValues(addr, port).Set(0)
}
}
}

nr.connStats.TcpSockets = *sockStats
}

Expand Down Expand Up @@ -269,36 +233,17 @@ func (nr *NetstatReader) updateMetrics() {
metrics.TCPStateGauge.WithLabelValues(state).Set(float64(v))
}

totalCount := 0
for remoteAddr, v := range nr.connStats.TcpSockets.socketByRemoteAddr {
addrPort, err := netip.ParseAddrPort(remoteAddr)
if err != nil {
nr.l.Error("Failed to parse remote address", zap.Error(err))
// only count valid remote addresses
if _, err := netip.ParseAddrPort(remoteAddr); err != nil {
nr.l.Error("failed to parse remote address", zap.String("remoteAddr", remoteAddr), zap.Error(err))
continue
}
addr := addrPort.Addr().String()
port := strconv.Itoa(int(addrPort.Port()))
if !validateRemoteAddr(addr) {
continue
}

metrics.TCPConnectionRemoteGauge.WithLabelValues(addr, port).Set(float64(v))
totalCount += v
}
metrics.TCPConnectionRemoteGauge.WithLabelValues(addrDefaultTCPRemote).Set(float64(totalCount))

// UDP COnnection State metrics
metrics.UDPConnectionStatsGauge.WithLabelValues(utils.Active).Set(float64(nr.connStats.UdpSockets.totalActiveSockets))
}

func validateRemoteAddr(addr string) bool {
if addr == "" {
return false
}

// check if the address is in the ignore list
for _, addressToIgnore := range ignoreList {
if strings.HasPrefix(addr, addressToIgnore) {
return false
}
}

return true
}
50 changes: 0 additions & 50 deletions pkg/plugin/linuxutil/netstat_stats_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,12 @@ package linuxutil
import (
"fmt"
"net"
"os"
"testing"

"github.com/cakturk/go-netstat/netstat"
"github.com/microsoft/retina/pkg/log"
"github.com/prometheus/client_golang/prometheus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
gomock "go.uber.org/mock/gomock"
)

Expand Down Expand Up @@ -337,51 +335,3 @@ func TestReadSockStats(t *testing.T) {

nr.updateMetrics()
}

// Test IP that belongs to a closed connection being removed from the metrics
func TestReadSockStatsRemoveClosedConnection(t *testing.T) {
// Set os env variable to enable debug logs
os.Setenv("NODE_IP", "10.224.0.6")

_, err := log.SetupZapLogger(log.GetDefaultLogOpts())
require.NoError(t, err)
opts := &NetstatOpts{
CuratedKeys: false,
AddZeroVal: false,
ListenSock: false,
}

ctrl := gomock.NewController(t)
defer ctrl.Finish()

ns := NewMockNetstatInterface(ctrl)
nr := NewNetstatReader(opts, ns)
assert.NotNil(t, nr)
InitalizeMetricsForTesting(ctrl)

testmetric := prometheus.NewGauge(prometheus.GaugeOpts{
Name: "testmetric",
Help: "testmetric",
})

// Initial value
nr.opts.PrevTCPSockStats = &SocketStats{
socketByRemoteAddr: map[string]int{
"192.168.1.100:80": 1,
},
}

// Latest read would not contain the IP in PrevTCPSockStats
ns.EXPECT().TCPSocks(gomock.Any()).Return([]netstat.SockTabEntry{}, nil).Times(1)
ns.EXPECT().UDPSocks(gomock.Any()).Return([]netstat.SockTabEntry{}, nil).Times(1)

// We are expecting the gauge to be called once for this value as it is removed
MockGaugeVec.EXPECT().WithLabelValues("192.168.1.100", "80").Return(testmetric).Times(1)

// We are not expecting the gauge to be called for localhost or node IP
MockGaugeVec.EXPECT().WithLabelValues("127.0.0.1", "80").Return(testmetric).Times(0)
MockGaugeVec.EXPECT().WithLabelValues("10.224.0.6", "80").Return(testmetric).Times(0)

err = nr.readSockStats()
require.NoError(t, err)
}

0 comments on commit 78b5dca

Please sign in to comment.