From f4257f91f6e4de16989f85d35cfd2559465b1d40 Mon Sep 17 00:00:00 2001
From: Paul Banks
Date: Tue, 16 Feb 2021 15:17:54 +0000
Subject: [PATCH 1/2] Set gRPC keepalives to mirror Yamux keepalive behaviour

---
 agent/grpc/client.go  | 19 ++++++++++++++++++-
 agent/grpc/handler.go |  5 +++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/agent/grpc/client.go b/agent/grpc/client.go
index 9fdfa54e6..8e4387382 100644
--- a/agent/grpc/client.go
+++ b/agent/grpc/client.go
@@ -5,8 +5,10 @@ import (
 	"fmt"
 	"net"
 	"sync"
+	"time"
 
 	"google.golang.org/grpc"
+	"google.golang.org/grpc/keepalive"
 
 	"github.com/hashicorp/consul/agent/metadata"
 	"github.com/hashicorp/consul/agent/pool"
@@ -64,7 +66,22 @@ func (c *ClientConnPool) ClientConn(datacenter string) (*grpc.ClientConn, error)
 		grpc.WithDisableRetry(),
 		grpc.WithStatsHandler(newStatsHandler(defaultMetrics())),
 		// nolint:staticcheck // there is no other supported alternative to WithBalancerName
-		grpc.WithBalancerName("pick_first"))
+		grpc.WithBalancerName("pick_first"),
+		// Keep alive parameters are based on the same default ones we used for
+		// Yamux. These are somewhat arbitrary but we did observe in scale testing
+		// that the gRPC defaults (servers send keepalives only every 2 hours,
+		// clients never) seemed to result in TCP drops going undetected until
+		// actual updates needed to be sent which caused unnecessary delays for
+		// deliveries. These settings should be no more work for servers than
+		// existing yamux clients but hopefully allow TCP drops to be detected
+		// earlier and so have a smaller chance of going unnoticed until there are
+		// actual updates to send out from the servers. The servers have a policy to
+		// not accept pings any faster than once every 15 seconds to protect against
+		// abuse.
+		grpc.WithKeepaliveParams(keepalive.ClientParameters{
+			Time:    30 * time.Second,
+			Timeout: 10 * time.Second,
+		}))
 	if err != nil {
 		return nil, err
 	}
diff --git a/agent/grpc/handler.go b/agent/grpc/handler.go
index 388954dc4..53705b4dd 100644
--- a/agent/grpc/handler.go
+++ b/agent/grpc/handler.go
@@ -6,8 +6,10 @@ package grpc
 import (
 	"fmt"
 	"net"
+	"time"
 
 	"google.golang.org/grpc"
+	"google.golang.org/grpc/keepalive"
 )
 
 // NewHandler returns a gRPC server that accepts connections from Handle(conn).
@@ -20,6 +22,9 @@ func NewHandler(addr net.Addr, register func(server *grpc.Server)) *Handler {
 	srv := grpc.NewServer(
 		grpc.StatsHandler(newStatsHandler(metrics)),
 		grpc.StreamInterceptor((&activeStreamCounter{metrics: metrics}).Intercept),
+		grpc.KeepaliveEnforcementPolicy(keepalive.EnforcementPolicy{
+			MinTime: 15 * time.Second,
+		}),
 	)
 
 	register(srv)
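
For reference, the client and server halves of this patch pair up roughly as in the standalone Go sketch below: with an active stream, the client pings after 30 seconds of inactivity and drops the connection if no ack arrives within 10 seconds, while the server's enforcement policy treats pings more frequent than every 15 seconds as abusive, so a well-behaved client never trips it. This is only an illustrative sketch, not Consul code; the 127.0.0.1:8502 listen address, the insecure dial, and the empty server registration are assumptions made for the example.

    package main

    import (
        "log"
        "net"
        "time"

        "google.golang.org/grpc"
        "google.golang.org/grpc/keepalive"
    )

    func main() {
        // Server side: mirror the EnforcementPolicy added in agent/grpc/handler.go.
        // Pings arriving faster than once every 15s are treated as abusive.
        lis, err := net.Listen("tcp", "127.0.0.1:8502") // assumed address
        if err != nil {
            log.Fatal(err)
        }
        srv := grpc.NewServer(
            grpc.KeepaliveEnforcementPolicy(keepalive.EnforcementPolicy{
                MinTime: 15 * time.Second,
            }),
        )
        go srv.Serve(lis)
        defer srv.Stop()

        // Client side: mirror the ClientParameters added in agent/grpc/client.go.
        // With an active stream, a ping is sent after 30s of inactivity; 10s
        // without an ack closes the connection.
        conn, err := grpc.Dial(lis.Addr().String(),
            grpc.WithInsecure(),
            grpc.WithKeepaliveParams(keepalive.ClientParameters{
                Time:    30 * time.Second,
                Timeout: 10 * time.Second,
            }),
        )
        if err != nil {
            log.Fatal(err)
        }
        defer conn.Close()
    }
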
From 571324ad08e3fbfc485c07a0c6d59939e966d98e Mon Sep 17 00:00:00 2001
From: Paul Banks
Date: Thu, 18 Feb 2021 16:13:51 +0000
Subject: [PATCH 2/2] Tune streaming backoff on errors to retry a bit faster when TCP connections drop

---
 agent/cache-types/streaming_health_services.go | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/agent/cache-types/streaming_health_services.go b/agent/cache-types/streaming_health_services.go
index 649138345..8305c54e5 100644
--- a/agent/cache-types/streaming_health_services.go
+++ b/agent/cache-types/streaming_health_services.go
@@ -113,9 +113,13 @@ func newMaterializer(
 		Logger: deps.Logger,
 		Waiter: &retry.Waiter{
 			MinFailures: 1,
-			MinWait:     0,
-			MaxWait:     60 * time.Second,
-			Jitter:      retry.NewJitter(100),
+			// Start backing off with small increments (200-400ms) which will double
+			// each attempt. (200-400, 400-800, 800-1600, 1600-3200, 3200-6000, 6000
+			// after that). (retry.Wait applies Max limit after jitter right now).
+			Factor:      200 * time.Millisecond,
+			MinWait:     0,
+			MaxWait:     60 * time.Second,
+			Jitter:      retry.NewJitter(100),
 		},
 		Request: newRequestFn,
 	}), nil
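
For reference, the waits quoted in the new comment come from doubling a 200ms base on each failure, adding up to 100% jitter, and clamping at MaxWait only after the jitter. The standalone Go sketch below illustrates that kind of curve; it is an approximation for illustration only, not the actual retry.Waiter implementation, and the backoff helper is a made-up name. The practical effect is that the first retry after a dropped TCP connection lands in the 200-400ms range rather than waiting on a long fixed delay.

    package main

    import (
        "fmt"
        "math/rand"
        "time"
    )

    // backoff approximates the wait described in the patch comment: a 200ms base
    // doubled per failure, plus up to 100% jitter, capped at 60s after jitter.
    func backoff(failures int) time.Duration {
        const (
            factor  = 200 * time.Millisecond
            maxWait = 60 * time.Second
        )
        if failures < 1 {
            return 0
        }
        if failures > 20 {
            failures = 20 // keep the shift well below overflow; the cap dominates anyway
        }
        wait := factor * time.Duration(1<<(failures-1))      // 200ms, 400ms, 800ms, ...
        wait += time.Duration(rand.Int63n(int64(wait) + 1))  // up to 100% jitter
        if wait > maxWait {
            wait = maxWait // max limit applied after jitter, as the comment notes
        }
        return wait
    }

    func main() {
        // Print the wait for the first ten consecutive failures.
        for i := 1; i <= 10; i++ {
            fmt.Printf("failure %2d: wait %v\n", i, backoff(i))
        }
    }
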