Add grpc keepalive configuration. (#19339) Prior to the introduction of this configuration, grpc keepalive messages were sent after 2 hours of inactivity on the stream. This posed issues in various scenarios where the server-side xds connection balancing was unaware that envoy instances were uncleanly killed / force-closed, since the connections would only be cleaned up after ~5 minutes of TCP timeouts occurred. Setting this config to a 30 second interval with a 20 second timeout ensures that at most, it should take up to 50 seconds for a dead xds connection to be closed.
This commit is contained in:
parent
d547958f2c
commit
19f9de2224
|
@ -0,0 +1,4 @@
|
||||||
|
```release-note:bug
|
||||||
|
connect: Fix bug where uncleanly closed xDS connections would influence connection balancing for too long and prevent envoy instances from starting. Two new configuration fields
|
||||||
|
`performance.grpc_keepalive_timeout` and `performance.grpc_keepalive_interval` now exist to allow for configuration on how often these dead connections will be cleaned up.
|
||||||
|
```
|
|
@ -34,6 +34,7 @@ import (
|
||||||
"golang.org/x/net/http2"
|
"golang.org/x/net/http2"
|
||||||
"golang.org/x/net/http2/h2c"
|
"golang.org/x/net/http2/h2c"
|
||||||
"google.golang.org/grpc"
|
"google.golang.org/grpc"
|
||||||
|
"google.golang.org/grpc/keepalive"
|
||||||
|
|
||||||
"github.com/hashicorp/consul/acl"
|
"github.com/hashicorp/consul/acl"
|
||||||
"github.com/hashicorp/consul/acl/resolver"
|
"github.com/hashicorp/consul/acl/resolver"
|
||||||
|
@ -678,6 +679,10 @@ func (a *Agent) Start(ctx context.Context) error {
|
||||||
metrics.Default(),
|
metrics.Default(),
|
||||||
a.tlsConfigurator,
|
a.tlsConfigurator,
|
||||||
incomingRPCLimiter,
|
incomingRPCLimiter,
|
||||||
|
keepalive.ServerParameters{
|
||||||
|
Time: a.config.GRPCKeepaliveInterval,
|
||||||
|
Timeout: a.config.GRPCKeepaliveTimeout,
|
||||||
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
server, err := consul.NewServer(consulCfg, a.baseDeps.Deps, a.externalGRPCServer, incomingRPCLimiter, serverLogger)
|
server, err := consul.NewServer(consulCfg, a.baseDeps.Deps, a.externalGRPCServer, incomingRPCLimiter, serverLogger)
|
||||||
|
@ -709,6 +714,10 @@ func (a *Agent) Start(ctx context.Context) error {
|
||||||
metrics.Default(),
|
metrics.Default(),
|
||||||
a.tlsConfigurator,
|
a.tlsConfigurator,
|
||||||
rpcRate.NullRequestLimitsHandler(),
|
rpcRate.NullRequestLimitsHandler(),
|
||||||
|
keepalive.ServerParameters{
|
||||||
|
Time: a.config.GRPCKeepaliveInterval,
|
||||||
|
Timeout: a.config.GRPCKeepaliveTimeout,
|
||||||
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
client, err := consul.NewClient(consulCfg, a.baseDeps.Deps)
|
client, err := consul.NewClient(consulCfg, a.baseDeps.Deps)
|
||||||
|
|
|
@ -1018,6 +1018,8 @@ func (b *builder) build() (rt RuntimeConfig, err error) {
|
||||||
GRPCPort: grpcPort,
|
GRPCPort: grpcPort,
|
||||||
GRPCTLSAddrs: grpcTlsAddrs,
|
GRPCTLSAddrs: grpcTlsAddrs,
|
||||||
GRPCTLSPort: grpcTlsPort,
|
GRPCTLSPort: grpcTlsPort,
|
||||||
|
GRPCKeepaliveInterval: b.durationValWithDefaultMin("performance.grpc_keepalive_interval", c.Performance.GRPCKeepaliveInterval, 30*time.Second, time.Second),
|
||||||
|
GRPCKeepaliveTimeout: b.durationValWithDefaultMin("performance.grpc_keepalive_timeout", c.Performance.GRPCKeepaliveTimeout, 20*time.Second, time.Second),
|
||||||
HTTPMaxConnsPerClient: intVal(c.Limits.HTTPMaxConnsPerClient),
|
HTTPMaxConnsPerClient: intVal(c.Limits.HTTPMaxConnsPerClient),
|
||||||
HTTPSHandshakeTimeout: b.durationVal("limits.https_handshake_timeout", c.Limits.HTTPSHandshakeTimeout),
|
HTTPSHandshakeTimeout: b.durationVal("limits.https_handshake_timeout", c.Limits.HTTPSHandshakeTimeout),
|
||||||
KVMaxValueSize: uint64Val(c.Limits.KVMaxValueSize),
|
KVMaxValueSize: uint64Val(c.Limits.KVMaxValueSize),
|
||||||
|
|
|
@ -673,9 +673,11 @@ type HTTPConfig struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
type Performance struct {
|
type Performance struct {
|
||||||
LeaveDrainTime *string `mapstructure:"leave_drain_time"`
|
LeaveDrainTime *string `mapstructure:"leave_drain_time"`
|
||||||
RaftMultiplier *int `mapstructure:"raft_multiplier"` // todo(fs): validate as uint
|
RaftMultiplier *int `mapstructure:"raft_multiplier"` // todo(fs): validate as uint
|
||||||
RPCHoldTimeout *string `mapstructure:"rpc_hold_timeout"`
|
RPCHoldTimeout *string `mapstructure:"rpc_hold_timeout"`
|
||||||
|
GRPCKeepaliveInterval *string `mapstructure:"grpc_keepalive_interval"`
|
||||||
|
GRPCKeepaliveTimeout *string `mapstructure:"grpc_keepalive_timeout"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type Telemetry struct {
|
type Telemetry struct {
|
||||||
|
|
|
@ -118,6 +118,8 @@ func DefaultSource() Source {
|
||||||
leave_drain_time = "5s"
|
leave_drain_time = "5s"
|
||||||
raft_multiplier = ` + strconv.Itoa(int(consul.DefaultRaftMultiplier)) + `
|
raft_multiplier = ` + strconv.Itoa(int(consul.DefaultRaftMultiplier)) + `
|
||||||
rpc_hold_timeout = "7s"
|
rpc_hold_timeout = "7s"
|
||||||
|
grpc_keepalive_interval = "30s"
|
||||||
|
grpc_keepalive_timeout = "20s"
|
||||||
}
|
}
|
||||||
ports = {
|
ports = {
|
||||||
dns = 8600
|
dns = 8600
|
||||||
|
|
|
@ -717,6 +717,19 @@ type RuntimeConfig struct {
|
||||||
// hcl: client_addr = string addresses { grpc_tls = string } ports { grpc_tls = int }
|
// hcl: client_addr = string addresses { grpc_tls = string } ports { grpc_tls = int }
|
||||||
GRPCTLSAddrs []net.Addr
|
GRPCTLSAddrs []net.Addr
|
||||||
|
|
||||||
|
// GRPCKeepaliveInterval determines how frequently an HTTP2 keepalive will be broadcast
|
||||||
|
// whenever a GRPC connection is idle. This helps detect xds connections that have died.
|
||||||
|
//
|
||||||
|
// Since the xds load balancing between servers relies on knowing how many connections
|
||||||
|
// are active, this configuration ensures that they are routinely detected / cleaned up
|
||||||
|
// on an interval.
|
||||||
|
GRPCKeepaliveInterval time.Duration
|
||||||
|
|
||||||
|
// GRPCKeepaliveTimeout specifies how long a GRPC client has to reply to the keepalive
|
||||||
|
// messages spawned from GRPCKeepaliveInterval. If a client does not reply in this amount of
|
||||||
|
// time, the connection will be closed by the server.
|
||||||
|
GRPCKeepaliveTimeout time.Duration
|
||||||
|
|
||||||
// HTTPAddrs contains the list of TCP addresses and UNIX sockets the HTTP
|
// HTTPAddrs contains the list of TCP addresses and UNIX sockets the HTTP
|
||||||
// server will bind to. If the HTTP endpoint is disabled (ports.http <= 0)
|
// server will bind to. If the HTTP endpoint is disabled (ports.http <= 0)
|
||||||
// the list is empty.
|
// the list is empty.
|
||||||
|
|
|
@ -6430,6 +6430,8 @@ func TestLoad_FullConfig(t *testing.T) {
|
||||||
GRPCAddrs: []net.Addr{tcpAddr("32.31.61.91:4881")},
|
GRPCAddrs: []net.Addr{tcpAddr("32.31.61.91:4881")},
|
||||||
GRPCTLSPort: 5201,
|
GRPCTLSPort: 5201,
|
||||||
GRPCTLSAddrs: []net.Addr{tcpAddr("23.14.88.19:5201")},
|
GRPCTLSAddrs: []net.Addr{tcpAddr("23.14.88.19:5201")},
|
||||||
|
GRPCKeepaliveInterval: 33 * time.Second,
|
||||||
|
GRPCKeepaliveTimeout: 22 * time.Second,
|
||||||
HTTPAddrs: []net.Addr{tcpAddr("83.39.91.39:7999")},
|
HTTPAddrs: []net.Addr{tcpAddr("83.39.91.39:7999")},
|
||||||
HTTPBlockEndpoints: []string{"RBvAFcGD", "fWOWFznh"},
|
HTTPBlockEndpoints: []string{"RBvAFcGD", "fWOWFznh"},
|
||||||
AllowWriteHTTPFrom: []*net.IPNet{cidr("127.0.0.0/8"), cidr("22.33.44.55/32"), cidr("0.0.0.0/0")},
|
AllowWriteHTTPFrom: []*net.IPNet{cidr("127.0.0.0/8"), cidr("22.33.44.55/32"), cidr("0.0.0.0/0")},
|
||||||
|
|
|
@ -209,6 +209,8 @@
|
||||||
"GRPCPort": 0,
|
"GRPCPort": 0,
|
||||||
"GRPCTLSAddrs": [],
|
"GRPCTLSAddrs": [],
|
||||||
"GRPCTLSPort": 0,
|
"GRPCTLSPort": 0,
|
||||||
|
"GRPCKeepaliveInterval": "0s",
|
||||||
|
"GRPCKeepaliveTimeout": "0s",
|
||||||
"GossipLANGossipInterval": "0s",
|
"GossipLANGossipInterval": "0s",
|
||||||
"GossipLANGossipNodes": 0,
|
"GossipLANGossipNodes": 0,
|
||||||
"GossipLANProbeInterval": "0s",
|
"GossipLANProbeInterval": "0s",
|
||||||
|
|
|
@ -335,6 +335,8 @@ performance {
|
||||||
leave_drain_time = "8265s"
|
leave_drain_time = "8265s"
|
||||||
raft_multiplier = 5
|
raft_multiplier = 5
|
||||||
rpc_hold_timeout = "15707s"
|
rpc_hold_timeout = "15707s"
|
||||||
|
grpc_keepalive_interval = "33s"
|
||||||
|
grpc_keepalive_timeout = "22s"
|
||||||
}
|
}
|
||||||
pid_file = "43xN80Km"
|
pid_file = "43xN80Km"
|
||||||
ports {
|
ports {
|
||||||
|
|
|
@ -383,7 +383,9 @@
|
||||||
"performance": {
|
"performance": {
|
||||||
"leave_drain_time": "8265s",
|
"leave_drain_time": "8265s",
|
||||||
"raft_multiplier": 5,
|
"raft_multiplier": 5,
|
||||||
"rpc_hold_timeout": "15707s"
|
"rpc_hold_timeout": "15707s",
|
||||||
|
"grpc_keepalive_interval": "33s",
|
||||||
|
"grpc_keepalive_timeout": "22s"
|
||||||
},
|
},
|
||||||
"pid_file": "43xN80Km",
|
"pid_file": "43xN80Km",
|
||||||
"ports": {
|
"ports": {
|
||||||
|
|
|
@ -26,6 +26,7 @@ import (
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
"golang.org/x/time/rate"
|
"golang.org/x/time/rate"
|
||||||
"google.golang.org/grpc"
|
"google.golang.org/grpc"
|
||||||
|
"google.golang.org/grpc/keepalive"
|
||||||
|
|
||||||
"github.com/hashicorp/consul-net-rpc/net/rpc"
|
"github.com/hashicorp/consul-net-rpc/net/rpc"
|
||||||
|
|
||||||
|
@ -335,7 +336,7 @@ func newServerWithDeps(t *testing.T, c *Config, deps Deps) (*Server, error) {
|
||||||
oldNotify()
|
oldNotify()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
grpcServer := external.NewServer(deps.Logger.Named("grpc.external"), nil, deps.TLSConfigurator, rpcRate.NullRequestLimitsHandler())
|
grpcServer := external.NewServer(deps.Logger.Named("grpc.external"), nil, deps.TLSConfigurator, rpcRate.NullRequestLimitsHandler(), keepalive.ServerParameters{})
|
||||||
srv, err := NewServer(c, deps, grpcServer, nil, deps.Logger)
|
srv, err := NewServer(c, deps, grpcServer, nil, deps.Logger)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
|
|
@ -27,7 +27,13 @@ var (
|
||||||
|
|
||||||
// NewServer constructs a gRPC server for the external gRPC port, to which
|
// NewServer constructs a gRPC server for the external gRPC port, to which
|
||||||
// handlers can be registered.
|
// handlers can be registered.
|
||||||
func NewServer(logger agentmiddleware.Logger, metricsObj *metrics.Metrics, tls *tlsutil.Configurator, limiter rate.RequestLimitsHandler) *grpc.Server {
|
func NewServer(
|
||||||
|
logger agentmiddleware.Logger,
|
||||||
|
metricsObj *metrics.Metrics,
|
||||||
|
tls *tlsutil.Configurator,
|
||||||
|
limiter rate.RequestLimitsHandler,
|
||||||
|
keepaliveParams keepalive.ServerParameters,
|
||||||
|
) *grpc.Server {
|
||||||
if metricsObj == nil {
|
if metricsObj == nil {
|
||||||
metricsObj = metrics.Default()
|
metricsObj = metrics.Default()
|
||||||
}
|
}
|
||||||
|
@ -56,6 +62,7 @@ func NewServer(logger agentmiddleware.Logger, metricsObj *metrics.Metrics, tls *
|
||||||
grpc.StatsHandler(agentmiddleware.NewStatsHandler(metricsObj, metricsLabels)),
|
grpc.StatsHandler(agentmiddleware.NewStatsHandler(metricsObj, metricsLabels)),
|
||||||
middleware.WithUnaryServerChain(unaryInterceptors...),
|
middleware.WithUnaryServerChain(unaryInterceptors...),
|
||||||
middleware.WithStreamServerChain(streamInterceptors...),
|
middleware.WithStreamServerChain(streamInterceptors...),
|
||||||
|
grpc.KeepaliveParams(keepaliveParams),
|
||||||
grpc.KeepaliveEnforcementPolicy(keepalive.EnforcementPolicy{
|
grpc.KeepaliveEnforcementPolicy(keepalive.EnforcementPolicy{
|
||||||
// This must be less than the keealive.ClientParameters Time setting, otherwise
|
// This must be less than the keealive.ClientParameters Time setting, otherwise
|
||||||
// the server will disconnect the client for sending too many keepalive pings.
|
// the server will disconnect the client for sending too many keepalive pings.
|
||||||
|
|
|
@ -14,6 +14,7 @@ import (
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
"golang.org/x/sync/errgroup"
|
"golang.org/x/sync/errgroup"
|
||||||
"google.golang.org/grpc"
|
"google.golang.org/grpc"
|
||||||
|
"google.golang.org/grpc/keepalive"
|
||||||
|
|
||||||
"github.com/hashicorp/go-hclog"
|
"github.com/hashicorp/go-hclog"
|
||||||
|
|
||||||
|
@ -27,7 +28,7 @@ import (
|
||||||
func TestServer_EmitsStats(t *testing.T) {
|
func TestServer_EmitsStats(t *testing.T) {
|
||||||
sink, metricsObj := testutil.NewFakeSink(t)
|
sink, metricsObj := testutil.NewFakeSink(t)
|
||||||
|
|
||||||
srv := NewServer(hclog.Default(), metricsObj, nil, rate.NullRequestLimitsHandler())
|
srv := NewServer(hclog.Default(), metricsObj, nil, rate.NullRequestLimitsHandler(), keepalive.ServerParameters{})
|
||||||
|
|
||||||
testservice.RegisterSimpleServer(srv, &testservice.Simple{})
|
testservice.RegisterSimpleServer(srv, &testservice.Simple{})
|
||||||
|
|
||||||
|
|
|
@ -21,6 +21,7 @@ import (
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
gogrpc "google.golang.org/grpc"
|
gogrpc "google.golang.org/grpc"
|
||||||
"google.golang.org/grpc/codes"
|
"google.golang.org/grpc/codes"
|
||||||
|
"google.golang.org/grpc/keepalive"
|
||||||
"google.golang.org/grpc/metadata"
|
"google.golang.org/grpc/metadata"
|
||||||
grpcstatus "google.golang.org/grpc/status"
|
grpcstatus "google.golang.org/grpc/status"
|
||||||
"google.golang.org/protobuf/proto"
|
"google.golang.org/protobuf/proto"
|
||||||
|
@ -1816,7 +1817,7 @@ func newTestServer(t *testing.T, cb func(conf *consul.Config)) testingServer {
|
||||||
conf.ACLResolverSettings.EnterpriseMeta = *conf.AgentEnterpriseMeta()
|
conf.ACLResolverSettings.EnterpriseMeta = *conf.AgentEnterpriseMeta()
|
||||||
|
|
||||||
deps := newDefaultDeps(t, conf)
|
deps := newDefaultDeps(t, conf)
|
||||||
externalGRPCServer := external.NewServer(deps.Logger, nil, deps.TLSConfigurator, rate.NullRequestLimitsHandler())
|
externalGRPCServer := external.NewServer(deps.Logger, nil, deps.TLSConfigurator, rate.NullRequestLimitsHandler(), keepalive.ServerParameters{})
|
||||||
|
|
||||||
server, err := consul.NewServer(conf, deps, externalGRPCServer, nil, deps.Logger)
|
server, err := consul.NewServer(conf, deps, externalGRPCServer, nil, deps.Logger)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
|
@ -607,6 +607,10 @@ Refer to the [formatting specification](https://golang.org/pkg/time/#ParseDurati
|
||||||
This was added in Consul 1.0. Must be a duration value such as 10s. Defaults
|
This was added in Consul 1.0. Must be a duration value such as 10s. Defaults
|
||||||
to 7s.
|
to 7s.
|
||||||
|
|
||||||
|
- `grpc_keepalive_interval` - A duration that determines the frequency that Consul servers send keep-alive messages to inactive gRPC clients. Configure this setting to modify how quickly Consul detects and removes improperly closed xDS or peering connections. Default is `30s`.
|
||||||
|
|
||||||
|
- `grpc_keepalive_timeout` - A duration that determines how long a Consul server waits for a reply to a keep-alive message. If the server does not receive a reply before the end of the duration, Consul flags the gRPC connection as unhealthy and forcibly removes it. Defaults to `20s`.
|
||||||
|
|
||||||
- `pid_file` Equivalent to the [`-pid-file` command line flag](/consul/docs/agent/config/cli-flags#_pid_file).
|
- `pid_file` Equivalent to the [`-pid-file` command line flag](/consul/docs/agent/config/cli-flags#_pid_file).
|
||||||
|
|
||||||
- `ports` This is a nested object that allows setting the bind ports for the following keys:
|
- `ports` This is a nested object that allows setting the bind ports for the following keys:
|
||||||
|
|
Loading…
Reference in New Issue