Investigate the unreasonably fast denial latency metrics #7597

Closed
Tracked by #5545
beautifulentropy opened this issue Jul 16, 2024 · 0 comments · Fixed by #7627
beautifulentropy commented Jul 16, 2024

Something we're seeing in some of the metrics we checked today is that, consistently, 50% of keys are "NotFound". We really don't expect to see this under normal operating conditions unless 50% of callers have never interacted with our key-value rate limits system before. This could explain why we're seeing super fast denials: it's not that denials are the only data being served locally, it's that the only denials we're getting are the ones being served locally. Denials are expected to be the fastest operation because they're just one GET operation plus a little math that exits early, whereas an allowed call must GET, compute the new TAT, and SET the new TAT.
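For context, here's roughly what the two paths look like. This is a simplified GCRA-style sketch, not our actual implementation: maybeSpend and its parameters are illustrative, and it uses a plain *redis.Client (go-redis v9, plus the usual context/errors/time imports) where we actually use a redis.Ring.

func maybeSpend(ctx context.Context, rdb *redis.Client, key string, emissionInterval, burstOffset time.Duration) (bool, error) {
	now := time.Now()

	// GET the stored theoretical arrival time (TAT). A missing key
	// (redis.Nil) is the "NotFound" case: this caller has never been
	// seen before.
	tatNanos, err := rdb.Get(ctx, key).Int64()
	switch {
	case errors.Is(err, redis.Nil):
		tatNanos = now.UnixNano() // first interaction: start fresh
	case err != nil:
		return false, err
	}
	tat := time.Unix(0, tatNanos)
	if tat.Before(now) {
		tat = now
	}

	// Denial fast path: one GET plus a comparison, then return.
	newTAT := tat.Add(emissionInterval)
	if newTAT.Sub(now) > burstOffset {
		return false, nil
	}

	// Allow path: persist the new TAT, which costs a second round trip.
	err = rdb.Set(ctx, key, newTAT.UnixNano(), 0).Err()
	if err != nil {
		return false, err
	}
	return true, nil
}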

I wrote a couple of tests to ensure that the sharding implementation we're using from go-redis, when combined with our custom DNS SRV resolver, is actually doing what it's supposed to. The following test ensures that two separately configured redis.Ring clients:

  • are distributing writes somewhat evenly
  • are sharding pipeline writes consistently
  • are sharding pipeline reads consistently
// Assumed imports (Boulder package paths); newTestRedisRing and newLookup
// are package-local test helpers.
import (
	"context"
	"fmt"
	"testing"
	"time"

	"github.com/redis/go-redis/v9"

	"github.com/letsencrypt/boulder/cmd"
	blog "github.com/letsencrypt/boulder/log"
	"github.com/letsencrypt/boulder/metrics"
	"github.com/letsencrypt/boulder/test"
)

func TestSharding(t *testing.T) {
	logger := blog.NewMock()
	ring1 := newTestRedisRing()
	ring2 := newTestRedisRing()
	lookup1, err := newLookup([]cmd.ServiceDomain{
		{
			Service: "redisratelimits",
			Domain:  "service.consul",
		},
	},
		"consul.service.consul",
		250*time.Millisecond,
		ring1,
		logger,
		metrics.NoopRegisterer,
	)
	test.AssertNotError(t, err, "Expected newLookup construction to succeed")
	lookup1.start()
	defer lookup1.stop()
	lookup2, err := newLookup([]cmd.ServiceDomain{
		{
			Service: "redisratelimits",
			Domain:  "service.consul",
		},
	},
		"consul.service.consul",
		250*time.Millisecond,
		ring2,
		logger,
		metrics.NoopRegisterer,
	)
	test.AssertNotError(t, err, "Expected newLookup construction to succeed")
	lookup2.start()
	defer lookup2.stop()

	// Write 1,000 keys through ring1 in a single pipeline.
	pipeline := ring1.Pipeline()
	for i := 0; i < 1000; i++ {
		err := pipeline.Set(context.Background(), fmt.Sprintf("key%d", i), "value", 0).Err()
		test.AssertNotError(t, err, "Expected SET to succeed for all shards")
	}
	cmds, err := pipeline.Exec(context.Background())
	test.AssertNotError(t, err, "Expected EXEC to succeed for all shards")
	for _, c := range cmds {
		_, err := c.(*redis.StatusCmd).Result()
		test.AssertNotError(t, err, "Expected SET to succeed for all shards")
	}

	// Grab a reference to each of ring1's two shards.
	var shard1 *redis.Client
	var shard2 *redis.Client
	err = ring1.ForEachShard(context.Background(), func(ctx context.Context, shard *redis.Client) error {
		if shard1 == nil {
			shard1 = shard
		} else {
			shard2 = shard
		}
		return nil
	})
	test.AssertNotError(t, err, "Expected ForEach to succeed")

	// INFO keyspace reports each shard's per-database key counts.
	shard1Keyspace := shard1.Info(context.Background(), "keyspace").Val()
	shard2Keyspace := shard2.Info(context.Background(), "keyspace").Val()

	fmt.Println(shard1.Options().Addr, shard1Keyspace)
	fmt.Println(shard2.Options().Addr, shard2Keyspace)

	// Try to read all written keys from both shards using a pipeline.
	pipeline = ring2.Pipeline()
	for i := 0; i < 1000; i++ {
		err := pipeline.Get(context.Background(), fmt.Sprintf("key%d", i)).Err()
		test.AssertNotError(t, err, "Expected GET to succeed for all shards")
	}
	cmds, err = pipeline.Exec(context.Background())
	test.AssertNotError(t, err, "Expected EXEC to succeed for all shards")
	for _, c := range cmds {
		_, err := c.(*redis.StringCmd).Result()
		test.AssertNotError(t, err, "Expected GET to succeed for all shards")
	}

	// Reset the shard references so we collect ring2's shards this time;
	// without this, shard1 would still point at ring1's first shard.
	shard1, shard2 = nil, nil
	err = ring2.ForEachShard(context.Background(), func(ctx context.Context, shard *redis.Client) error {
		if shard1 == nil {
			shard1 = shard
		} else {
			shard2 = shard
		}
		return nil
	})
	test.AssertNotError(t, err, "Expected ForEach to succeed")

	shard1Keyspace = shard1.Info(context.Background(), "keyspace").Val()
	shard2Keyspace = shard2.Info(context.Background(), "keyspace").Val()

	fmt.Println(shard1.Options().Addr, shard1Keyspace)
	fmt.Println(shard2.Options().Addr, shard2Keyspace)
}

Running the test shows the 1,000 writes split roughly evenly across the two shards, and both rings observe the same distribution:

=== RUN   TestSharding
10.33.33.5:4218
db0:keys=525,expires=0,avg_ttl=0

10.33.33.4:4218
db0:keys=475,expires=0,avg_ttl=0

10.33.33.5:4218
db0:keys=525,expires=0,avg_ttl=0

10.33.33.4:4218
db0:keys=475,expires=0,avg_ttl=0
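
For completeness, rather than eyeballing the printed keyspace lines, the counts could be asserted programmatically. A rough sketch (parseKeyCount is hypothetical and not part of the test above; assumes Go 1.20+ for strings.CutPrefix plus the strings/strconv imports):

// parseKeyCount pulls the db0 key count out of INFO keyspace output,
// e.g. "db0:keys=525,expires=0,avg_ttl=0".
func parseKeyCount(info string) (int, error) {
	for _, line := range strings.Split(info, "\n") {
		rest, ok := strings.CutPrefix(strings.TrimSpace(line), "db0:")
		if !ok {
			continue
		}
		for _, field := range strings.Split(rest, ",") {
			if count, found := strings.CutPrefix(field, "keys="); found {
				return strconv.Atoi(count)
			}
		}
	}
	return 0, fmt.Errorf("no db0 key count in %q", info)
}

The test could then check that every write landed on exactly one of the two shards:

	count1, err := parseKeyCount(shard1Keyspace)
	test.AssertNotError(t, err, "parsing shard1 keyspace")
	count2, err := parseKeyCount(shard2Keyspace)
	test.AssertNotError(t, err, "parsing shard2 keyspace")
	// 525 + 475 == 1000: all writes landed, split roughly evenly.
	test.AssertEquals(t, count1+count2, 1000)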