agent: prevent very old servers re-joining a cluster with stale data (#17171)
* agent: configure server lastseen timestamp Signed-off-by: Dan Bond <danbond@protonmail.com> * use correct config Signed-off-by: Dan Bond <danbond@protonmail.com> * add comments Signed-off-by: Dan Bond <danbond@protonmail.com> * use default age in test golden data Signed-off-by: Dan Bond <danbond@protonmail.com> * add changelog Signed-off-by: Dan Bond <danbond@protonmail.com> * fix runtime test Signed-off-by: Dan Bond <danbond@protonmail.com> * agent: add server_metadata Signed-off-by: Dan Bond <danbond@protonmail.com> * update comments Signed-off-by: Dan Bond <danbond@protonmail.com> * correctly check if metadata file does not exist Signed-off-by: Dan Bond <danbond@protonmail.com> * follow instructions for adding new config Signed-off-by: Dan Bond <danbond@protonmail.com> * add comments Signed-off-by: Dan Bond <danbond@protonmail.com> * update comments Signed-off-by: Dan Bond <danbond@protonmail.com> * Update agent/agent.go Co-authored-by: Dan Upton <daniel@floppy.co> * agent/config: add validation for duration with min Signed-off-by: Dan Bond <danbond@protonmail.com> * docs: add new server_rejoin_age_max config definition Signed-off-by: Dan Bond <danbond@protonmail.com> * agent: add unit test for checking server last seen Signed-off-by: Dan Bond <danbond@protonmail.com> * agent: log continually for 60s before erroring Signed-off-by: Dan Bond <danbond@protonmail.com> * pr comments Signed-off-by: Dan Bond <danbond@protonmail.com> * remove unneeded todo * agent: fix error message Signed-off-by: Dan Bond <danbond@protonmail.com> --------- Signed-off-by: Dan Bond <danbond@protonmail.com> Co-authored-by: Dan Upton <daniel@floppy.co>
This commit is contained in:
parent
91ed8de9f5
commit
6bb7782745
|
@ -0,0 +1,3 @@
|
||||||
|
```release-note:improvement
|
||||||
|
agent: add a configurable maximum age (default: 7 days) to prevent servers re-joining a cluster with stale data
|
||||||
|
```
|
|
@ -7,6 +7,7 @@ import (
|
||||||
"context"
|
"context"
|
||||||
"crypto/tls"
|
"crypto/tls"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"net"
|
"net"
|
||||||
|
@ -22,8 +23,6 @@ import (
|
||||||
|
|
||||||
"github.com/armon/go-metrics"
|
"github.com/armon/go-metrics"
|
||||||
"github.com/armon/go-metrics/prometheus"
|
"github.com/armon/go-metrics/prometheus"
|
||||||
"github.com/hashicorp/consul/agent/rpcclient"
|
|
||||||
"github.com/hashicorp/consul/agent/rpcclient/configentry"
|
|
||||||
"github.com/hashicorp/go-connlimit"
|
"github.com/hashicorp/go-connlimit"
|
||||||
"github.com/hashicorp/go-hclog"
|
"github.com/hashicorp/go-hclog"
|
||||||
"github.com/hashicorp/go-memdb"
|
"github.com/hashicorp/go-memdb"
|
||||||
|
@ -50,12 +49,13 @@ import (
|
||||||
grpcDNS "github.com/hashicorp/consul/agent/grpc-external/services/dns"
|
grpcDNS "github.com/hashicorp/consul/agent/grpc-external/services/dns"
|
||||||
middleware "github.com/hashicorp/consul/agent/grpc-middleware"
|
middleware "github.com/hashicorp/consul/agent/grpc-middleware"
|
||||||
"github.com/hashicorp/consul/agent/hcp/scada"
|
"github.com/hashicorp/consul/agent/hcp/scada"
|
||||||
libscada "github.com/hashicorp/consul/agent/hcp/scada"
|
|
||||||
"github.com/hashicorp/consul/agent/local"
|
"github.com/hashicorp/consul/agent/local"
|
||||||
"github.com/hashicorp/consul/agent/proxycfg"
|
"github.com/hashicorp/consul/agent/proxycfg"
|
||||||
proxycfgglue "github.com/hashicorp/consul/agent/proxycfg-glue"
|
proxycfgglue "github.com/hashicorp/consul/agent/proxycfg-glue"
|
||||||
catalogproxycfg "github.com/hashicorp/consul/agent/proxycfg-sources/catalog"
|
catalogproxycfg "github.com/hashicorp/consul/agent/proxycfg-sources/catalog"
|
||||||
localproxycfg "github.com/hashicorp/consul/agent/proxycfg-sources/local"
|
localproxycfg "github.com/hashicorp/consul/agent/proxycfg-sources/local"
|
||||||
|
"github.com/hashicorp/consul/agent/rpcclient"
|
||||||
|
"github.com/hashicorp/consul/agent/rpcclient/configentry"
|
||||||
"github.com/hashicorp/consul/agent/rpcclient/health"
|
"github.com/hashicorp/consul/agent/rpcclient/health"
|
||||||
"github.com/hashicorp/consul/agent/structs"
|
"github.com/hashicorp/consul/agent/structs"
|
||||||
"github.com/hashicorp/consul/agent/systemd"
|
"github.com/hashicorp/consul/agent/systemd"
|
||||||
|
@ -575,11 +575,11 @@ func (a *Agent) Start(ctx context.Context) error {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
// copy over the existing node id, this cannot be
|
// Copy over the existing node id. This cannot be
|
||||||
// changed while running anyways but this prevents
|
// changed while running, but this prevents
|
||||||
// breaking some existing behavior. then overwrite
|
// breaking some existing behavior.
|
||||||
// the configuration
|
|
||||||
c.NodeID = a.config.NodeID
|
c.NodeID = a.config.NodeID
|
||||||
|
// Overwrite the configuration.
|
||||||
a.config = c
|
a.config = c
|
||||||
|
|
||||||
if err := a.tlsConfigurator.Update(a.config.TLS); err != nil {
|
if err := a.tlsConfigurator.Update(a.config.TLS); err != nil {
|
||||||
|
@ -625,6 +625,20 @@ func (a *Agent) Start(ctx context.Context) error {
|
||||||
if c.ServerMode {
|
if c.ServerMode {
|
||||||
serverLogger := a.baseDeps.Logger.NamedIntercept(logging.ConsulServer)
|
serverLogger := a.baseDeps.Logger.NamedIntercept(logging.ConsulServer)
|
||||||
|
|
||||||
|
// Check for a last seen timestamp and exit if deemed stale before attempting to join
|
||||||
|
// Serf/Raft or listen for requests.
|
||||||
|
if err := a.checkServerLastSeen(consul.ReadServerMetadata); err != nil {
|
||||||
|
deadline := time.Now().Add(time.Minute)
|
||||||
|
for time.Now().Before(deadline) {
|
||||||
|
a.logger.Error("startup error", "error", err)
|
||||||
|
time.Sleep(10 * time.Second)
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// periodically write server metadata to disk.
|
||||||
|
go a.persistServerMetadata()
|
||||||
|
|
||||||
incomingRPCLimiter := consul.ConfiguredIncomingRPCLimiter(
|
incomingRPCLimiter := consul.ConfiguredIncomingRPCLimiter(
|
||||||
&lib.StopChannelContext{StopCh: a.shutdownCh},
|
&lib.StopChannelContext{StopCh: a.shutdownCh},
|
||||||
serverLogger,
|
serverLogger,
|
||||||
|
@ -661,7 +675,6 @@ func (a *Agent) Start(ctx context.Context) error {
|
||||||
return fmt.Errorf("failed to start server cert manager: %w", err)
|
return fmt.Errorf("failed to start server cert manager: %w", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
a.externalGRPCServer = external.NewServer(
|
a.externalGRPCServer = external.NewServer(
|
||||||
a.logger.Named("grpc.external"),
|
a.logger.Named("grpc.external"),
|
||||||
|
@ -1094,7 +1107,7 @@ func (a *Agent) listenHTTP() ([]apiServer, error) {
|
||||||
MaxHeaderBytes: a.config.HTTPMaxHeaderBytes,
|
MaxHeaderBytes: a.config.HTTPMaxHeaderBytes,
|
||||||
}
|
}
|
||||||
|
|
||||||
if libscada.IsCapability(l.Addr()) {
|
if scada.IsCapability(l.Addr()) {
|
||||||
// wrap in http2 server handler
|
// wrap in http2 server handler
|
||||||
httpServer.Handler = h2c.NewHandler(srv.handler(a.config.EnableDebug), &http2.Server{})
|
httpServer.Handler = h2c.NewHandler(srv.handler(a.config.EnableDebug), &http2.Server{})
|
||||||
}
|
}
|
||||||
|
@ -1521,6 +1534,8 @@ func newConsulConfig(runtimeCfg *config.RuntimeConfig, logger hclog.Logger) (*co
|
||||||
|
|
||||||
cfg.Reporting.License.Enabled = runtimeCfg.Reporting.License.Enabled
|
cfg.Reporting.License.Enabled = runtimeCfg.Reporting.License.Enabled
|
||||||
|
|
||||||
|
cfg.ServerRejoinAgeMax = runtimeCfg.ServerRejoinAgeMax
|
||||||
|
|
||||||
enterpriseConsulConfig(cfg, runtimeCfg)
|
enterpriseConsulConfig(cfg, runtimeCfg)
|
||||||
|
|
||||||
return cfg, nil
|
return cfg, nil
|
||||||
|
@ -4529,7 +4544,70 @@ func (a *Agent) proxyDataSources() proxycfg.DataSources {
|
||||||
|
|
||||||
a.fillEnterpriseProxyDataSources(&sources)
|
a.fillEnterpriseProxyDataSources(&sources)
|
||||||
return sources
|
return sources
|
||||||
|
}
|
||||||
|
|
||||||
|
// persistServerMetadata periodically writes a server's metadata to a file
|
||||||
|
// in the configured data directory.
|
||||||
|
func (a *Agent) persistServerMetadata() {
|
||||||
|
file := filepath.Join(a.config.DataDir, consul.ServerMetadataFile)
|
||||||
|
|
||||||
|
// Create a timer with no initial tick to allow metadata to be written immediately.
|
||||||
|
t := time.NewTimer(0)
|
||||||
|
defer t.Stop()
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-t.C:
|
||||||
|
// Reset the timer to the larger periodic interval.
|
||||||
|
t.Reset(1 * time.Hour)
|
||||||
|
|
||||||
|
f, err := consul.OpenServerMetadata(file)
|
||||||
|
if err != nil {
|
||||||
|
a.logger.Error("failed to open existing server metadata: %w", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := consul.WriteServerMetadata(f); err != nil {
|
||||||
|
f.Close()
|
||||||
|
a.logger.Error("failed to write server metadata: %w", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
f.Close()
|
||||||
|
case <-a.shutdownCh:
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// checkServerLastSeen is a safety check that only occurs once of startup to prevent old servers
|
||||||
|
// with stale data from rejoining an existing cluster.
|
||||||
|
//
|
||||||
|
// It attempts to read a server's metadata file and check the last seen Unix timestamp against a
|
||||||
|
// configurable max age. If the metadata file does not exist, we treat this as an initial startup
|
||||||
|
// and return no error.
|
||||||
|
//
|
||||||
|
// Example: if the server recorded a last seen timestamp of now-7d, and we configure a max age
|
||||||
|
// of 3d, then we should prevent the server from rejoining.
|
||||||
|
func (a *Agent) checkServerLastSeen(readFn consul.ServerMetadataReadFunc) error {
|
||||||
|
filename := filepath.Join(a.config.DataDir, consul.ServerMetadataFile)
|
||||||
|
|
||||||
|
// Read server metadata file.
|
||||||
|
md, err := readFn(filename)
|
||||||
|
if err != nil {
|
||||||
|
// Return early if it doesn't exist as this likely indicates the server is starting for the first time.
|
||||||
|
if errors.Is(err, os.ErrNotExist) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return fmt.Errorf("error reading server metadata: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
maxAge := a.config.ServerRejoinAgeMax
|
||||||
|
if md.IsLastSeenStale(maxAge) {
|
||||||
|
return fmt.Errorf("refusing to rejoin cluster because server has been offline for more than the configured server_rejoin_age_max (%s) - consider wiping your data dir", maxAge)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func listenerPortKey(svcID structs.ServiceID, checkID structs.CheckID) string {
|
func listenerPortKey(svcID structs.ServiceID, checkID structs.CheckID) string {
|
||||||
|
|
|
@ -12,6 +12,7 @@ import (
|
||||||
"crypto/x509"
|
"crypto/x509"
|
||||||
"encoding/base64"
|
"encoding/base64"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
mathrand "math/rand"
|
mathrand "math/rand"
|
||||||
"net"
|
"net"
|
||||||
|
@ -6204,6 +6205,70 @@ cloud {
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestAgent_checkServerLastSeen(t *testing.T) {
|
||||||
|
bd := BaseDeps{
|
||||||
|
Deps: consul.Deps{
|
||||||
|
Logger: hclog.NewInterceptLogger(nil),
|
||||||
|
Tokens: new(token.Store),
|
||||||
|
GRPCConnPool: &fakeGRPCConnPool{},
|
||||||
|
},
|
||||||
|
RuntimeConfig: &config.RuntimeConfig{},
|
||||||
|
Cache: cache.New(cache.Options{}),
|
||||||
|
}
|
||||||
|
agent, err := New(bd)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
// Test that an ErrNotExist OS error is treated as ok.
|
||||||
|
t.Run("TestReadErrNotExist", func(t *testing.T) {
|
||||||
|
readFn := func(filename string) (*consul.ServerMetadata, error) {
|
||||||
|
return nil, os.ErrNotExist
|
||||||
|
}
|
||||||
|
|
||||||
|
err := agent.checkServerLastSeen(readFn)
|
||||||
|
require.NoError(t, err)
|
||||||
|
})
|
||||||
|
|
||||||
|
// Test that an error reading server metadata is treated as an error.
|
||||||
|
t.Run("TestReadErr", func(t *testing.T) {
|
||||||
|
expected := errors.New("read error")
|
||||||
|
readFn := func(filename string) (*consul.ServerMetadata, error) {
|
||||||
|
return nil, expected
|
||||||
|
}
|
||||||
|
|
||||||
|
err := agent.checkServerLastSeen(readFn)
|
||||||
|
require.ErrorIs(t, err, expected)
|
||||||
|
})
|
||||||
|
|
||||||
|
// Test that a server with a 7d old last seen timestamp is treated as an error.
|
||||||
|
t.Run("TestIsLastSeenStaleErr", func(t *testing.T) {
|
||||||
|
agent.config.ServerRejoinAgeMax = time.Hour
|
||||||
|
|
||||||
|
readFn := func(filename string) (*consul.ServerMetadata, error) {
|
||||||
|
return &consul.ServerMetadata{
|
||||||
|
LastSeenUnix: time.Now().Add(-24 * 7 * time.Hour).Unix(),
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
err := agent.checkServerLastSeen(readFn)
|
||||||
|
require.Error(t, err)
|
||||||
|
require.ErrorContains(t, err, "refusing to rejoin cluster because server has been offline for more than the configured server_rejoin_age_max")
|
||||||
|
})
|
||||||
|
|
||||||
|
// Test that a server with a 6h old last seen timestamp is not treated as an error.
|
||||||
|
t.Run("TestNoErr", func(t *testing.T) {
|
||||||
|
agent.config.ServerRejoinAgeMax = 24 * 7 * time.Hour
|
||||||
|
|
||||||
|
readFn := func(filename string) (*consul.ServerMetadata, error) {
|
||||||
|
return &consul.ServerMetadata{
|
||||||
|
LastSeenUnix: time.Now().Add(-6 * time.Hour).Unix(),
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
err := agent.checkServerLastSeen(readFn)
|
||||||
|
require.NoError(t, err)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
func getExpectedCaPoolByFile(t *testing.T) *x509.CertPool {
|
func getExpectedCaPoolByFile(t *testing.T) *x509.CertPool {
|
||||||
pool := x509.NewCertPool()
|
pool := x509.NewCertPool()
|
||||||
data, err := os.ReadFile("../test/ca/root.cer")
|
data, err := os.ReadFile("../test/ca/root.cer")
|
||||||
|
|
|
@ -28,8 +28,6 @@ import (
|
||||||
"github.com/hashicorp/memberlist"
|
"github.com/hashicorp/memberlist"
|
||||||
"golang.org/x/time/rate"
|
"golang.org/x/time/rate"
|
||||||
|
|
||||||
hcpconfig "github.com/hashicorp/consul/agent/hcp/config"
|
|
||||||
|
|
||||||
"github.com/hashicorp/consul/agent/cache"
|
"github.com/hashicorp/consul/agent/cache"
|
||||||
"github.com/hashicorp/consul/agent/checks"
|
"github.com/hashicorp/consul/agent/checks"
|
||||||
"github.com/hashicorp/consul/agent/connect/ca"
|
"github.com/hashicorp/consul/agent/connect/ca"
|
||||||
|
@ -37,6 +35,7 @@ import (
|
||||||
"github.com/hashicorp/consul/agent/consul/authmethod/ssoauth"
|
"github.com/hashicorp/consul/agent/consul/authmethod/ssoauth"
|
||||||
consulrate "github.com/hashicorp/consul/agent/consul/rate"
|
consulrate "github.com/hashicorp/consul/agent/consul/rate"
|
||||||
"github.com/hashicorp/consul/agent/dns"
|
"github.com/hashicorp/consul/agent/dns"
|
||||||
|
hcpconfig "github.com/hashicorp/consul/agent/hcp/config"
|
||||||
"github.com/hashicorp/consul/agent/rpc/middleware"
|
"github.com/hashicorp/consul/agent/rpc/middleware"
|
||||||
"github.com/hashicorp/consul/agent/structs"
|
"github.com/hashicorp/consul/agent/structs"
|
||||||
"github.com/hashicorp/consul/agent/token"
|
"github.com/hashicorp/consul/agent/token"
|
||||||
|
@ -1090,6 +1089,7 @@ func (b *builder) build() (rt RuntimeConfig, err error) {
|
||||||
ServerMode: serverMode,
|
ServerMode: serverMode,
|
||||||
ServerName: stringVal(c.ServerName),
|
ServerName: stringVal(c.ServerName),
|
||||||
ServerPort: serverPort,
|
ServerPort: serverPort,
|
||||||
|
ServerRejoinAgeMax: b.durationValWithDefaultMin("server_rejoin_age_max", c.ServerRejoinAgeMax, 24*7*time.Hour, 6*time.Hour),
|
||||||
Services: services,
|
Services: services,
|
||||||
SessionTTLMin: b.durationVal("session_ttl_min", c.SessionTTLMin),
|
SessionTTLMin: b.durationVal("session_ttl_min", c.SessionTTLMin),
|
||||||
SkipLeaveOnInt: skipLeaveOnInt,
|
SkipLeaveOnInt: skipLeaveOnInt,
|
||||||
|
@ -1952,6 +1952,16 @@ func (b *builder) durationValWithDefault(name string, v *string, defaultVal time
|
||||||
return d
|
return d
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// durationValWithDefaultMin is equivalent to durationValWithDefault, but enforces a minimum duration.
|
||||||
|
func (b *builder) durationValWithDefaultMin(name string, v *string, defaultVal, minVal time.Duration) (d time.Duration) {
|
||||||
|
d = b.durationValWithDefault(name, v, defaultVal)
|
||||||
|
if d < minVal {
|
||||||
|
b.err = multierror.Append(b.err, fmt.Errorf("%s: duration '%s' cannot be less than: %s", name, *v, minVal))
|
||||||
|
}
|
||||||
|
|
||||||
|
return d
|
||||||
|
}
|
||||||
|
|
||||||
func (b *builder) durationVal(name string, v *string) (d time.Duration) {
|
func (b *builder) durationVal(name string, v *string) (d time.Duration) {
|
||||||
return b.durationValWithDefault(name, v, 0)
|
return b.durationValWithDefault(name, v, 0)
|
||||||
}
|
}
|
||||||
|
|
|
@ -311,6 +311,21 @@ func TestBuilder_DurationVal_InvalidDuration(t *testing.T) {
|
||||||
require.Contains(t, b.err.Error(), badDuration2)
|
require.Contains(t, b.err.Error(), badDuration2)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBuilder_DurationValWithDefaultMin(t *testing.T) {
|
||||||
|
b := builder{}
|
||||||
|
|
||||||
|
// Attempt to validate that a duration of 10 hours will not error when the min val is 1 hour.
|
||||||
|
dur := "10h0m0s"
|
||||||
|
b.durationValWithDefaultMin("field2", &dur, 24*7*time.Hour, time.Hour)
|
||||||
|
require.NoError(t, b.err)
|
||||||
|
|
||||||
|
// Attempt to validate that a duration of 1 min will error when the min val is 1 hour.
|
||||||
|
dur = "0h1m0s"
|
||||||
|
b.durationValWithDefaultMin("field1", &dur, 24*7*time.Hour, time.Hour)
|
||||||
|
require.Error(t, b.err)
|
||||||
|
require.Contains(t, b.err.Error(), "1 error")
|
||||||
|
}
|
||||||
|
|
||||||
func TestBuilder_ServiceVal_MultiError(t *testing.T) {
|
func TestBuilder_ServiceVal_MultiError(t *testing.T) {
|
||||||
b := builder{}
|
b := builder{}
|
||||||
b.serviceVal(&ServiceDefinition{
|
b.serviceVal(&ServiceDefinition{
|
||||||
|
|
|
@ -228,6 +228,7 @@ type Config struct {
|
||||||
SerfBindAddrWAN *string `mapstructure:"serf_wan" json:"serf_wan,omitempty"`
|
SerfBindAddrWAN *string `mapstructure:"serf_wan" json:"serf_wan,omitempty"`
|
||||||
ServerMode *bool `mapstructure:"server" json:"server,omitempty"`
|
ServerMode *bool `mapstructure:"server" json:"server,omitempty"`
|
||||||
ServerName *string `mapstructure:"server_name" json:"server_name,omitempty"`
|
ServerName *string `mapstructure:"server_name" json:"server_name,omitempty"`
|
||||||
|
ServerRejoinAgeMax *string `mapstructure:"server_rejoin_age_max" json:"server_rejoin_age_max,omitempty"`
|
||||||
Service *ServiceDefinition `mapstructure:"service" json:"-"`
|
Service *ServiceDefinition `mapstructure:"service" json:"-"`
|
||||||
Services []ServiceDefinition `mapstructure:"services" json:"-"`
|
Services []ServiceDefinition `mapstructure:"services" json:"-"`
|
||||||
SessionTTLMin *string `mapstructure:"session_ttl_min" json:"session_ttl_min,omitempty"`
|
SessionTTLMin *string `mapstructure:"session_ttl_min" json:"session_ttl_min,omitempty"`
|
||||||
|
|
|
@ -58,6 +58,7 @@ func DefaultSource() Source {
|
||||||
segment_limit = 64
|
segment_limit = 64
|
||||||
|
|
||||||
server = false
|
server = false
|
||||||
|
server_rejoin_age_max = "168h"
|
||||||
syslog_facility = "LOCAL0"
|
syslog_facility = "LOCAL0"
|
||||||
|
|
||||||
tls = {
|
tls = {
|
||||||
|
|
|
@ -1358,6 +1358,18 @@ type RuntimeConfig struct {
|
||||||
// hcl: ports { server = int }
|
// hcl: ports { server = int }
|
||||||
ServerPort int
|
ServerPort int
|
||||||
|
|
||||||
|
// ServerRejoinAgeMax is used to specify the duration of time a server
|
||||||
|
// is allowed to be down/offline before a startup operation is refused.
|
||||||
|
//
|
||||||
|
// For example: if a server has been offline for 5 days, and this option
|
||||||
|
// is configured to 3 days, then any subsequent startup operation will fail
|
||||||
|
// and require an operator to manually intervene.
|
||||||
|
//
|
||||||
|
// The default is: 7 days
|
||||||
|
//
|
||||||
|
// hcl: server_rejoin_age_max = "duration"
|
||||||
|
ServerRejoinAgeMax time.Duration
|
||||||
|
|
||||||
// Services contains the provided service definitions:
|
// Services contains the provided service definitions:
|
||||||
//
|
//
|
||||||
// hcl: services = [
|
// hcl: services = [
|
||||||
|
|
|
@ -25,13 +25,12 @@ import (
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
"golang.org/x/time/rate"
|
"golang.org/x/time/rate"
|
||||||
|
|
||||||
hcpconfig "github.com/hashicorp/consul/agent/hcp/config"
|
|
||||||
|
|
||||||
"github.com/hashicorp/consul/acl"
|
"github.com/hashicorp/consul/acl"
|
||||||
"github.com/hashicorp/consul/agent/cache"
|
"github.com/hashicorp/consul/agent/cache"
|
||||||
"github.com/hashicorp/consul/agent/checks"
|
"github.com/hashicorp/consul/agent/checks"
|
||||||
"github.com/hashicorp/consul/agent/consul"
|
"github.com/hashicorp/consul/agent/consul"
|
||||||
consulrate "github.com/hashicorp/consul/agent/consul/rate"
|
consulrate "github.com/hashicorp/consul/agent/consul/rate"
|
||||||
|
hcpconfig "github.com/hashicorp/consul/agent/hcp/config"
|
||||||
"github.com/hashicorp/consul/agent/structs"
|
"github.com/hashicorp/consul/agent/structs"
|
||||||
"github.com/hashicorp/consul/agent/token"
|
"github.com/hashicorp/consul/agent/token"
|
||||||
"github.com/hashicorp/consul/lib"
|
"github.com/hashicorp/consul/lib"
|
||||||
|
@ -6419,6 +6418,7 @@ func TestLoad_FullConfig(t *testing.T) {
|
||||||
SerfPortWAN: 8302,
|
SerfPortWAN: 8302,
|
||||||
ServerMode: true,
|
ServerMode: true,
|
||||||
ServerName: "Oerr9n1G",
|
ServerName: "Oerr9n1G",
|
||||||
|
ServerRejoinAgeMax: 604800 * time.Second,
|
||||||
ServerPort: 3757,
|
ServerPort: 3757,
|
||||||
Services: []*structs.ServiceDefinition{
|
Services: []*structs.ServiceDefinition{
|
||||||
{
|
{
|
||||||
|
@ -7164,6 +7164,7 @@ func TestRuntimeConfig_Sanitize(t *testing.T) {
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
Locality: &Locality{Region: strPtr("us-west-1"), Zone: strPtr("us-west-1a")},
|
Locality: &Locality{Region: strPtr("us-west-1"), Zone: strPtr("us-west-1a")},
|
||||||
|
ServerRejoinAgeMax: 24 * 7 * time.Hour,
|
||||||
}
|
}
|
||||||
|
|
||||||
b, err := json.MarshalIndent(rt.Sanitized(), "", " ")
|
b, err := json.MarshalIndent(rt.Sanitized(), "", " ")
|
||||||
|
|
|
@ -332,6 +332,7 @@
|
||||||
"ServerMode": false,
|
"ServerMode": false,
|
||||||
"ServerName": "",
|
"ServerName": "",
|
||||||
"ServerPort": 0,
|
"ServerPort": 0,
|
||||||
|
"ServerRejoinAgeMax": "168h0m0s",
|
||||||
"Services": [
|
"Services": [
|
||||||
{
|
{
|
||||||
"Address": "",
|
"Address": "",
|
||||||
|
|
|
@ -394,6 +394,7 @@ serf_lan = "99.43.63.15"
|
||||||
serf_wan = "67.88.33.19"
|
serf_wan = "67.88.33.19"
|
||||||
server = true
|
server = true
|
||||||
server_name = "Oerr9n1G"
|
server_name = "Oerr9n1G"
|
||||||
|
server_rejoin_age_max = "604800s"
|
||||||
service = {
|
service = {
|
||||||
id = "dLOXpSCI"
|
id = "dLOXpSCI"
|
||||||
name = "o1ynPkp0"
|
name = "o1ynPkp0"
|
||||||
|
|
|
@ -453,6 +453,7 @@
|
||||||
"serf_wan": "67.88.33.19",
|
"serf_wan": "67.88.33.19",
|
||||||
"server": true,
|
"server": true,
|
||||||
"server_name": "Oerr9n1G",
|
"server_name": "Oerr9n1G",
|
||||||
|
"server_rejoin_age_max": "604800s",
|
||||||
"service": {
|
"service": {
|
||||||
"id": "dLOXpSCI",
|
"id": "dLOXpSCI",
|
||||||
"name": "o1ynPkp0",
|
"name": "o1ynPkp0",
|
||||||
|
|
|
@ -447,6 +447,10 @@ type Config struct {
|
||||||
|
|
||||||
// Embedded Consul Enterprise specific configuration
|
// Embedded Consul Enterprise specific configuration
|
||||||
*EnterpriseConfig
|
*EnterpriseConfig
|
||||||
|
|
||||||
|
// ServerRejoinAgeMax is used to specify the duration of time a server
|
||||||
|
// is allowed to be down/offline before a startup operation is refused.
|
||||||
|
ServerRejoinAgeMax time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Config) InPrimaryDatacenter() bool {
|
func (c *Config) InPrimaryDatacenter() bool {
|
||||||
|
@ -574,6 +578,8 @@ func DefaultConfig() *Config {
|
||||||
PeeringTestAllowPeerRegistrations: false,
|
PeeringTestAllowPeerRegistrations: false,
|
||||||
|
|
||||||
EnterpriseConfig: DefaultEnterpriseConfig(),
|
EnterpriseConfig: DefaultEnterpriseConfig(),
|
||||||
|
|
||||||
|
ServerRejoinAgeMax: 24 * 7 * time.Hour,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Increase our reap interval to 3 days instead of 24h.
|
// Increase our reap interval to 3 days instead of 24h.
|
||||||
|
|
|
@ -0,0 +1,71 @@
|
||||||
|
// Copyright (c) HashiCorp, Inc.
|
||||||
|
// SPDX-License-Identifier: MPL-2.0
|
||||||
|
|
||||||
|
package consul
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"io"
|
||||||
|
"os"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ServerMetadataFile is the name of the file on disk that server metadata
|
||||||
|
// should be written to.
|
||||||
|
const ServerMetadataFile = "server_metadata.json"
|
||||||
|
|
||||||
|
// ServerMetadata holds the metadata that is persisted for a running server.
type ServerMetadata struct {
	// LastSeenUnix is when the server was last seen, in seconds since the Unix epoch.
	LastSeenUnix int64 `json:"last_seen_unix"`
}

// IsLastSeenStale reports whether the last seen timestamp lies more than d
// in the past relative to the current time.
func (md *ServerMetadata) IsLastSeenStale(d time.Duration) bool {
	// Anything last seen strictly before this instant is considered stale.
	cutoff := time.Now().Add(-d)

	return cutoff.After(time.Unix(md.LastSeenUnix, 0))
}
|
||||||
|
|
||||||
|
// OpenServerMetadata is a helper function for opening the server metadata file
// with the correct permissions.
//
// The file is opened write-only, created with mode 0600 if it does not exist,
// and truncated if it does. On failure the returned writer is a nil interface
// value (not an interface wrapping a nil *os.File), so callers can safely
// compare it against nil.
func OpenServerMetadata(filename string) (io.WriteCloser, error) {
	f, err := os.OpenFile(filename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0600)
	if err != nil {
		// Return a literal nil to avoid the typed-nil-in-interface trap.
		return nil, err
	}

	return f, nil
}
|
||||||
|
|
||||||
|
// ServerMetadataReadFunc is the signature of a function that reads the server
// metadata stored at filename. ReadServerMetadata has this signature.
type ServerMetadataReadFunc func(filename string) (*ServerMetadata, error)
|
||||||
|
|
||||||
|
// ReadServerMetadata is a helper function for reading the contents of a server
|
||||||
|
// metadata file and unmarshaling the data from JSON.
|
||||||
|
func ReadServerMetadata(filename string) (*ServerMetadata, error) {
|
||||||
|
b, err := os.ReadFile(filename)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
var md ServerMetadata
|
||||||
|
if err := json.Unmarshal(b, &md); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return &md, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// WriteServerMetadata writes server metadata to a file in JSON format.
|
||||||
|
func WriteServerMetadata(w io.Writer) error {
|
||||||
|
md := &ServerMetadata{
|
||||||
|
LastSeenUnix: time.Now().Unix(),
|
||||||
|
}
|
||||||
|
|
||||||
|
b, err := json.Marshal(md)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, err := w.Write(b); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
|
@ -0,0 +1,68 @@
|
||||||
|
// Copyright (c) HashiCorp, Inc.
|
||||||
|
// SPDX-License-Identifier: MPL-2.0
|
||||||
|
|
||||||
|
package consul
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"errors"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
)
|
||||||
|
|
||||||
|
// mockServerMetadataWriter is a test writer that either fails every write
// with writeErr or accepts and discards the data.
type mockServerMetadataWriter struct {
	// writeErr, when non-nil, is returned from every call to Write.
	writeErr error
}

// Write returns (0, writeErr) when writeErr is set. Otherwise it discards p
// and reports all of it as written, as the io.Writer contract requires for a
// nil error.
func (m *mockServerMetadataWriter) Write(p []byte) (n int, err error) {
	if m.writeErr != nil {
		return 0, m.writeErr
	}

	return len(p), nil
}
|
||||||
|
|
||||||
|
func TestServerMetadata(t *testing.T) {
|
||||||
|
now := time.Now()
|
||||||
|
|
||||||
|
t.Run("TestIsLastSeenStaleTrue", func(t *testing.T) {
|
||||||
|
// Create a server that is 48 hours old.
|
||||||
|
md := &ServerMetadata{
|
||||||
|
LastSeenUnix: now.Add(-48 * time.Hour).Unix(),
|
||||||
|
}
|
||||||
|
|
||||||
|
stale := md.IsLastSeenStale(24 * time.Hour)
|
||||||
|
assert.True(t, stale)
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("TestIsLastSeenStaleFalse", func(t *testing.T) {
|
||||||
|
// Create a server that is 1 hour old.
|
||||||
|
md := &ServerMetadata{
|
||||||
|
LastSeenUnix: now.Add(-1 * time.Hour).Unix(),
|
||||||
|
}
|
||||||
|
|
||||||
|
stale := md.IsLastSeenStale(24 * time.Hour)
|
||||||
|
assert.False(t, stale)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestWriteServerMetadata(t *testing.T) {
|
||||||
|
t.Run("TestWriteError", func(t *testing.T) {
|
||||||
|
m := &mockServerMetadataWriter{
|
||||||
|
writeErr: errors.New("write error"),
|
||||||
|
}
|
||||||
|
|
||||||
|
err := WriteServerMetadata(m)
|
||||||
|
assert.Error(t, err)
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("TestOK", func(t *testing.T) {
|
||||||
|
b := new(bytes.Buffer)
|
||||||
|
|
||||||
|
err := WriteServerMetadata(b)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
assert.True(t, b.Len() > 0)
|
||||||
|
})
|
||||||
|
}
|
|
@ -736,6 +736,11 @@ Refer to the [formatting specification](https://golang.org/pkg/time/#ParseDurati
|
||||||
|
|
||||||
- `server` Equivalent to the [`-server` command-line flag](/consul/docs/agent/config/cli-flags#_server).
|
- `server` Equivalent to the [`-server` command-line flag](/consul/docs/agent/config/cli-flags#_server).
|
||||||
|
|
||||||
|
- `server_rejoin_age_max` - controls the allowed maximum age of a stale server attempting to rejoin a cluster.
|
||||||
|
If a server has not been running for longer than this period, it will refuse to start up again until an operator intervenes. This is to protect
|
||||||
|
clusters from instability caused by decommissioned servers accidentally being started again.
|
||||||
|
Note: the default value is `168h` (7 days) and the minimum value is `6h`.
|
||||||
|
|
||||||
- `non_voting_server` - **This field is deprecated in Consul 1.9.1. See the [`read_replica`](#read_replica) field instead.**
|
- `non_voting_server` - **This field is deprecated in Consul 1.9.1. See the [`read_replica`](#read_replica) field instead.**
|
||||||
|
|
||||||
- `read_replica` - Equivalent to the [`-read-replica` command-line flag](/consul/docs/agent/config/cli-flags#_read_replica).
|
- `read_replica` - Equivalent to the [`-read-replica` command-line flag](/consul/docs/agent/config/cli-flags#_read_replica).
|
||||||
|
|
Loading…
Reference in New Issue